ebook_tools 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,7 +1,7 @@
1
- 0.0.1 2013.4.2
1
+ 0.0.3 2013.4.2
2
2
  add book id for docbook
3
3
 
4
- 0.0.1 2013.4.1
4
+ 0.0.2 2013.4.1
5
5
  change docbook format
6
6
 
7
7
  0.0.1 2013.4.1
data/bin/ebook_tools CHANGED
@@ -82,10 +82,12 @@ def help(command=nil)
82
82
  batch_convert: 批量转换指定目录中的文件为epub格式文件,并存放到目标目录
83
83
  batch_extract: 批量提取指定目录中文件的书结构信息,并生成Docbook存放到目标目录
84
84
 
85
- 适用对象要求:
86
- 编码格式为utf-8
87
-
88
85
  具体命令的更多信息请通过'ebook_tools help <command>'查看。
86
+
87
+ 其他命令:
88
+ sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
89
+ iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
90
+ sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
89
91
  EOF
90
92
  end
91
93
 
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require File.join(File.expand_path('../../',__FILE__),'lib','utils')
6
+ require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
7
+
8
+ def help
9
+ puts <<-EOF
10
+ usage:
11
+ para_import_mongo [options] <docbook>
12
+
13
+ docbook: 已经标注重点段落的书文件或目录
14
+
15
+ options:
16
+ -H <host> , --host <host> : mongodb服务器,默认为localhost
17
+ -P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
18
+ -D <database>, --database <database> : 重点段落要存放的数据库
19
+ -C <collection>, --collection <collection> : 重点段落存放的集合
20
+ EOF
21
+ exit
22
+ end
23
+
24
+
25
+ options = {:host=>'localhost',:port=>27017,:database=>'resource_development',:collection=>'paras'}
26
+ opts = OptionParser.new do |opts|
27
+ opts.on('-H host','--host host') do |host|
28
+ options[:host] = host
29
+ end
30
+
31
+ opts.on('-P port','--port port') do |port|
32
+ options[:port] = port.to_i
33
+ end
34
+
35
+ opts.on('-D database','--database database') do |database|
36
+ options[:database] = database
37
+ end
38
+
39
+ opts.on('-C collection','--collection collection') do |collection|
40
+ options[:collection] = collection
41
+ end
42
+
43
+ opts.on('-h','--help') do
44
+ help
45
+ end
46
+ end
47
+ opts.parse ARGV
48
+
49
+ docbook = ARGV[-1]
50
+
51
+ if docbook.nil?
52
+ help
53
+ end
54
+
55
+ unless File.exists?(docbook)
56
+ raise "错误:指定的docbook不存在。"
57
+ end
58
+
59
+ files = if File.directory?(docbook)
60
+ Utils.scan_file_from_dir(docbook,{:format=>'.xml'})
61
+ else
62
+ [docbook]
63
+ end
64
+
65
+ files.each do |file|
66
+ begin
67
+ ParasInMongo.file_in_mongo(file,options)
68
+ rescue
69
+ puts "error: #{file} import mongo failure!"
70
+ end
71
+ end
72
+ puts "success: #{docbook} in mongo successfully!"
data/ebook_tools.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.0.3'
5
+ s.version = '0.0.4'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
9
- s.date = %q{2013-04-03}
9
+ s.date = %q{2013-04-05}
10
10
  s.description = %q{电子书工具集.}
11
11
  s.email = %q{aaron@nonobo.com}
12
12
  s.require_paths = ["lib"]
@@ -15,10 +15,12 @@ Gem::Specification.new do |s|
15
15
  s.has_rdoc = true
16
16
  s.rdoc_options = ["--charset=UTF-8"]
17
17
  s.executables << "ebook_tools"
18
+ s.executables << "para_import_mongo"
18
19
  s.files = [
19
20
  "README",
20
21
  "CHANGELOG",
21
22
  "bin/ebook_tools",
23
+ "bin/para_import_mongo",
22
24
  "lib/ebook_tools.rb",
23
25
  "lib/extract_book_struct.rb",
24
26
  "lib/header_detect.rb",
@@ -26,6 +28,7 @@ Gem::Specification.new do |s|
26
28
  "lib/txt.rb",
27
29
  "lib/epub.rb",
28
30
  "lib/utils.rb",
31
+ "lib/paras_in_mongo.rb",
29
32
  "ebook_tools.gemspec"
30
33
  ]
31
34
  s.add_dependency(%q<uuid>)
@@ -35,4 +38,5 @@ Gem::Specification.new do |s|
35
38
  s.add_dependency(%q<pdf-reader>)
36
39
  s.add_dependency(%q<nokogiri>)
37
40
  s.add_dependency(%q<levenshtein>)
41
+ s.add_dependency(%q<mongo>)
38
42
  end
@@ -0,0 +1,42 @@
1
+ # encoding: UTF-8
2
+ require 'mongo'
3
+ require 'nokogiri'
4
+
5
+ include Mongo
6
+ module ParasInMongo
7
+ extend self
8
+
9
+ def file_in_mongo(filename,options={})
10
+ client = MongoClient.new(options[:host], options[:port])
11
+ db = client[options[:database]]
12
+ coll = db[options[:collection]]
13
+
14
+ doc = Nokogiri::XML(File.open(filename).read)
15
+
16
+ book_id = doc.search("book")[0]['id']
17
+ title = doc.search("book info title")[0].text
18
+ author = doc.search("book info author")[0].text
19
+ pubdate = doc.search("book info pubdate")[0].text
20
+ publisher = doc.search("book info publisher")[0].text
21
+
22
+ paras = doc.search("para[key=yes]")
23
+
24
+ source ={book: {title: title,book_id: book_id, author: author,pubdate: pubdate, publisher: publisher}}
25
+
26
+ paras.each do |para|
27
+ para_attrs = {'_id' => para['id']}
28
+ content = para.search("content")[0].text
29
+
30
+ keywords = []
31
+ para.search("keyword").each do |keyword|
32
+ keywords << {keyword: keyword.text, weight: keyword['weight'].to_i}
33
+ end
34
+
35
+ para_attrs = para_attrs.merge(keywords: keywords, content: content)
36
+ section = para.parent.search("info title").text
37
+ para_attrs = para_attrs.merge(source: source.merge(location: {section: section}))
38
+
39
+ coll.insert(para_attrs)
40
+ end
41
+ end
42
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-03 00:00:00.000000000 Z
12
+ date: 2013-04-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: uuid
@@ -123,16 +123,34 @@ dependencies:
123
123
  - - ! '>='
124
124
  - !ruby/object:Gem::Version
125
125
  version: '0'
126
+ - !ruby/object:Gem::Dependency
127
+ name: mongo
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :runtime
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
126
142
  description: 电子书工具集.
127
143
  email: aaron@nonobo.com
128
144
  executables:
129
145
  - ebook_tools
146
+ - para_import_mongo
130
147
  extensions: []
131
148
  extra_rdoc_files: []
132
149
  files:
133
150
  - README
134
151
  - CHANGELOG
135
152
  - bin/ebook_tools
153
+ - bin/para_import_mongo
136
154
  - lib/ebook_tools.rb
137
155
  - lib/extract_book_struct.rb
138
156
  - lib/header_detect.rb
@@ -140,6 +158,7 @@ files:
140
158
  - lib/txt.rb
141
159
  - lib/epub.rb
142
160
  - lib/utils.rb
161
+ - lib/paras_in_mongo.rb
143
162
  - ebook_tools.gemspec
144
163
  homepage:
145
164
  licenses: []