ebook_tools 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,7 +1,7 @@
1
- 0.0.1 2013.4.2
1
+ 0.0.3 2013.4.2
2
2
  add book id for docbook
3
3
 
4
- 0.0.1 2013.4.1
4
+ 0.0.2 2013.4.1
5
5
  change docbook format
6
6
 
7
7
  0.0.1 2013.4.1
data/bin/ebook_tools CHANGED
@@ -82,10 +82,12 @@ def help(command=nil)
82
82
  batch_convert: 批量转换指定目录中的文件为epub格式文件,并存放到目标目录
83
83
  batch_extract: 批量提取指定目录中文件的书结构信息,并生成Docbook存放到目标目录
84
84
 
85
- 适用对象要求:
86
- 编码格式为utf-8
87
-
88
85
  具体命令的更多信息请通过'ebook_tools help <command>'查看。
86
+
87
+ 其他命令:
88
+ sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
89
+ iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
90
+ sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
89
91
  EOF
90
92
  end
91
93
 
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require File.join(File.expand_path('../../',__FILE__),'lib','utils')
6
+ require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
7
+
8
# Prints the usage text for the para_import_mongo command to standard output
# and then terminates the process with `exit`, so it never returns to the
# caller.
def help
  usage_text = <<-EOF
usage:
para_import_mongo [options] <docbook>

docbook: 已经标注重点段落的书文件或目录

options:
-H <host> , --host <host> : mongodb服务器,默认为localhost
-P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
-D <database>, --database <database> : 重点段落要存放的数据库
-C <collection>, --collection <collection> : 重点段落存放的集合
  EOF
  puts usage_text
  exit
end
23
+
24
+
25
# Connection defaults; every entry can be overridden from the command line.
options = {:host => 'localhost', :port => 27017, :database => 'resource_development', :collection => 'paras'}

# The block parameter has its own name so it no longer shadows the parser
# variable it is being assigned to.
parser = OptionParser.new do |o|
  o.on('-H host', '--host host') { |host| options[:host] = host }

  o.on('-P port', '--port port') { |port| options[:port] = port.to_i }

  o.on('-D database', '--database database') { |database| options[:database] = database }

  o.on('-C collection', '--collection collection') { |collection| options[:collection] = collection }

  # `help` prints the usage text and exits.
  o.on('-h', '--help') { help }
end

# OptionParser#parse leaves ARGV untouched and returns the arguments that are
# left once all options (and their values) have been consumed. Taking the
# docbook from that remainder instead of ARGV[-1] keeps an option value such
# as the "27018" in `book.xml -P 27018` from being mistaken for the docbook.
docbook = parser.parse(ARGV).last

# No positional argument at all: show the usage text (which exits).
help if docbook.nil?

# File.exist? replaces the deprecated File.exists? alias.
unless File.exist?(docbook)
  raise "错误:指定的docbook不存在。"
end

# A directory is expanded to the '.xml' files Utils finds in it; a plain file
# is imported on its own.
files = if File.directory?(docbook)
          Utils.scan_file_from_dir(docbook, {:format => '.xml'})
        else
          [docbook]
        end

# Import file by file so that one broken file does not stop the others, while
# remembering which ones failed.
failed = []
files.each do |file|
  begin
    ParasInMongo.file_in_mongo(file, options)
  rescue => e
    failed << file
    puts "error: #{file} import mongo failure!"
    # Surface the cause instead of silently discarding it.
    warn "  #{e.class}: #{e.message}"
  end
end

# Only claim success when every file was imported; otherwise exit non-zero so
# callers and shell scripts can detect the failure.
if failed.empty?
  puts "success: #{docbook} in mongo successfully!"
else
  warn "failure: #{failed.size} of #{files.size} file(s) were not imported."
  exit 1
end
data/ebook_tools.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.0.3'
5
+ s.version = '0.0.4'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
9
- s.date = %q{2013-04-03}
9
+ s.date = %q{2013-04-05}
10
10
  s.description = %q{电子书工具集.}
11
11
  s.email = %q{aaron@nonobo.com}
12
12
  s.require_paths = ["lib"]
@@ -15,10 +15,12 @@ Gem::Specification.new do |s|
15
15
  s.has_rdoc = true
16
16
  s.rdoc_options = ["--charset=UTF-8"]
17
17
  s.executables << "ebook_tools"
18
+ s.executables << "para_import_mongo"
18
19
  s.files = [
19
20
  "README",
20
21
  "CHANGELOG",
21
22
  "bin/ebook_tools",
23
+ "bin/para_import_mongo",
22
24
  "lib/ebook_tools.rb",
23
25
  "lib/extract_book_struct.rb",
24
26
  "lib/header_detect.rb",
@@ -26,6 +28,7 @@ Gem::Specification.new do |s|
26
28
  "lib/txt.rb",
27
29
  "lib/epub.rb",
28
30
  "lib/utils.rb",
31
+ "lib/paras_in_mongo.rb",
29
32
  "ebook_tools.gemspec"
30
33
  ]
31
34
  s.add_dependency(%q<uuid>)
@@ -35,4 +38,5 @@ Gem::Specification.new do |s|
35
38
  s.add_dependency(%q<pdf-reader>)
36
39
  s.add_dependency(%q<nokogiri>)
37
40
  s.add_dependency(%q<levenshtein>)
41
+ s.add_dependency(%q<mongo>)
38
42
  end
@@ -0,0 +1,42 @@
1
+ # encoding: UTF-8
2
+ require 'mongo'
3
+ require 'nokogiri'
4
+
5
+ include Mongo
6
# Imports the paragraphs flagged as key (`para[key=yes]`) of a DocBook XML
# file into a MongoDB collection, one document per paragraph.
#
# NOTE(review): `MongoClient.new(host, port)` and `Collection#insert` look like
# the mongo 1.x driver API, while the gemspec dependency on `mongo` carries no
# version constraint -- confirm which driver version is expected.
module ParasInMongo
  extend self

  # Parses +filename+ and inserts one document per key paragraph.
  #
  # filename - path of the DocBook XML file to import.
  # options  - Hash; the keys read are :host, :port, :database, :collection.
  #
  # Every inserted document looks like:
  #   { '_id' => <para id>, keywords: [{keyword:, weight:}], content: <text>,
  #     source: { book: {title:, book_id:, author:, pubdate:, publisher:},
  #               location: {section: <text>} } }
  #
  # Raises ArgumentError when the file contains no <book> element. Errors
  # from reading the file or from the driver propagate to the caller.
  def file_in_mongo(filename, options = {})
    # File.read closes the file handle itself, so nothing is left open.
    doc = Nokogiri::XML(File.read(filename))

    book = doc.search("book").first
    raise ArgumentError, "#{filename}: no <book> element found" if book.nil?

    # Book-level metadata is identical for every paragraph, so build it once.
    # A missing metadata element yields nil rather than aborting the import.
    source = {
      book: {
        title: node_text(doc, "book info title"),
        book_id: book['id'],
        author: node_text(doc, "book info author"),
        pubdate: node_text(doc, "book info pubdate"),
        publisher: node_text(doc, "book info publisher")
      }
    }

    # Fully qualified constant: the module does not rely on a top-level
    # `include Mongo` being in effect.
    client = Mongo::MongoClient.new(options[:host], options[:port])
    begin
      coll = client[options[:database]][options[:collection]]
      doc.search("para[key=yes]").each do |para|
        coll.insert(para_document(para, source))
      end
    ensure
      # One client is opened per imported file; always release it, even when
      # an insert fails.
      client.close
    end
  end

  private

  # Text of the first node under +node+ matching +selector+, or nil when
  # nothing matches.
  def node_text(node, selector)
    found = node.search(selector).first
    found && found.text
  end

  # Builds the document stored for a single key paragraph. +source+ is the
  # shared book metadata; the section title is taken from the paragraph's
  # parent element.
  def para_document(para, source)
    keywords = para.search("keyword").map do |keyword|
      {keyword: keyword.text, weight: keyword['weight'].to_i}
    end
    section = para.parent.search("info title").text

    {'_id' => para['id']}
      .merge(keywords: keywords, content: node_text(para, "content"))
      .merge(source: source.merge(location: {section: section}))
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-03 00:00:00.000000000 Z
12
+ date: 2013-04-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: uuid
@@ -123,16 +123,34 @@ dependencies:
123
123
  - - ! '>='
124
124
  - !ruby/object:Gem::Version
125
125
  version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: mongo
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
126
142
  description: 电子书工具集.
127
143
  email: aaron@nonobo.com
128
144
  executables:
129
145
  - ebook_tools
146
+ - para_import_mongo
130
147
  extensions: []
131
148
  extra_rdoc_files: []
132
149
  files:
133
150
  - README
134
151
  - CHANGELOG
135
152
  - bin/ebook_tools
153
+ - bin/para_import_mongo
136
154
  - lib/ebook_tools.rb
137
155
  - lib/extract_book_struct.rb
138
156
  - lib/header_detect.rb
@@ -140,6 +158,7 @@ files:
140
158
  - lib/txt.rb
141
159
  - lib/epub.rb
142
160
  - lib/utils.rb
161
+ - lib/paras_in_mongo.rb
143
162
  - ebook_tools.gemspec
144
163
  homepage:
145
164
  licenses: []