ebook_tools 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -2
- data/bin/ebook_tools +5 -3
- data/bin/para_import_mongo +72 -0
- data/ebook_tools.gemspec +6 -2
- data/lib/paras_in_mongo.rb +42 -0
- metadata +21 -2
data/CHANGELOG
CHANGED
data/bin/ebook_tools
CHANGED
@@ -82,10 +82,12 @@ def help(command=nil)
|
|
82
82
|
batch_convert: 批量转换指定目录中的文件为epub格式文件,并存放到目标目录
|
83
83
|
batch_extract: 批量提取指定目录中文件的书结构信息,并生成Docbook存放到目标目录
|
84
84
|
|
85
|
-
适用对象要求:
|
86
|
-
编码格式为utf-8
|
87
|
-
|
88
85
|
具体命令的更多信息请通过'ebook_tools help <command>'查看。
|
86
|
+
|
87
|
+
其他命令:
|
88
|
+
sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
|
89
|
+
iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
|
90
|
+
sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
|
89
91
|
EOF
|
90
92
|
end
|
91
93
|
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','utils')
|
6
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
|
7
|
+
|
8
|
+
def help
|
9
|
+
puts <<-EOF
|
10
|
+
usage:
|
11
|
+
para_import_mongo [options] <docbook>
|
12
|
+
|
13
|
+
docbook: 已经标注重点段落的书文件或目录
|
14
|
+
|
15
|
+
options:
|
16
|
+
-H <host> , --host <host> : mongodb服务器,默认为localhost
|
17
|
+
-P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
|
18
|
+
-D <database>, --database <database> : 重点段落要存放的数据库
|
19
|
+
-C <collection>, --collection <collection> : 重点段落存放的集合
|
20
|
+
EOF
|
21
|
+
exit
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
options = {:host=>'localhost',:port=>27017,:database=>'resource_development',:collection=>'paras'}
|
26
|
+
opts = OptionParser.new do |opts|
|
27
|
+
opts.on('-H host','--host host') do |host|
|
28
|
+
options[:host] = host
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on('-P port','--port port') do |port|
|
32
|
+
options[:port] = port.to_i
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on('-D database','--database database') do |database|
|
36
|
+
options[:database] = database
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on('-C collection','--collection collection') do |collection|
|
40
|
+
options[:collection] = collection
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-h','--help') do
|
44
|
+
help
|
45
|
+
end
|
46
|
+
end
|
47
|
+
opts.parse ARGV
|
48
|
+
|
49
|
+
docbook = ARGV[-1]
|
50
|
+
|
51
|
+
if docbook.nil?
|
52
|
+
help
|
53
|
+
end
|
54
|
+
|
55
|
+
unless File.exists?(docbook)
|
56
|
+
raise "错误:指定的docbook不存在。"
|
57
|
+
end
|
58
|
+
|
59
|
+
files = if File.directory?(docbook)
|
60
|
+
Utils.scan_file_from_dir(docbook,{:format=>'.xml'})
|
61
|
+
else
|
62
|
+
[docbook]
|
63
|
+
end
|
64
|
+
|
65
|
+
files.each do |file|
|
66
|
+
begin
|
67
|
+
ParasInMongo.file_in_mongo(file,options)
|
68
|
+
rescue
|
69
|
+
puts "error: #{file} import mongo failure!"
|
70
|
+
end
|
71
|
+
end
|
72
|
+
puts "success: #{docbook} in mongo successfully!"
|
data/ebook_tools.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.0.3'
|
5
|
+
s.version = '0.0.4'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
9
|
-
s.date = %q{2013-04-
|
9
|
+
s.date = %q{2013-04-05}
|
10
10
|
s.description = %q{电子书工具集.}
|
11
11
|
s.email = %q{aaron@nonobo.com}
|
12
12
|
s.require_paths = ["lib"]
|
@@ -15,10 +15,12 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.has_rdoc = true
|
16
16
|
s.rdoc_options = ["--charset=UTF-8"]
|
17
17
|
s.executables << "ebook_tools"
|
18
|
+
s.executables << "para_import_mongo"
|
18
19
|
s.files = [
|
19
20
|
"README",
|
20
21
|
"CHANGELOG",
|
21
22
|
"bin/ebook_tools",
|
23
|
+
"bin/para_import_mongo",
|
22
24
|
"lib/ebook_tools.rb",
|
23
25
|
"lib/extract_book_struct.rb",
|
24
26
|
"lib/header_detect.rb",
|
@@ -26,6 +28,7 @@ Gem::Specification.new do |s|
|
|
26
28
|
"lib/txt.rb",
|
27
29
|
"lib/epub.rb",
|
28
30
|
"lib/utils.rb",
|
31
|
+
"lib/paras_in_mongo.rb",
|
29
32
|
"ebook_tools.gemspec"
|
30
33
|
]
|
31
34
|
s.add_dependency(%q<uuid>)
|
@@ -35,4 +38,5 @@ Gem::Specification.new do |s|
|
|
35
38
|
s.add_dependency(%q<pdf-reader>)
|
36
39
|
s.add_dependency(%q<nokogiri>)
|
37
40
|
s.add_dependency(%q<levenshtein>)
|
41
|
+
s.add_dependency(%q<mongo>)
|
38
42
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'mongo'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
include Mongo
|
6
|
+
module ParasInMongo
|
7
|
+
extend self
|
8
|
+
|
9
|
+
def file_in_mongo(filename,options={})
|
10
|
+
client = MongoClient.new(options[:host], options[:port])
|
11
|
+
db = client[options[:database]]
|
12
|
+
coll = db[options[:collection]]
|
13
|
+
|
14
|
+
doc = Nokogiri::XML(File.open(filename).read)
|
15
|
+
|
16
|
+
book_id = doc.search("book")[0]['id']
|
17
|
+
title = doc.search("book info title")[0].text
|
18
|
+
author = doc.search("book info author")[0].text
|
19
|
+
pubdate = doc.search("book info pubdate")[0].text
|
20
|
+
publisher = doc.search("book info publisher")[0].text
|
21
|
+
|
22
|
+
paras = doc.search("para[key=yes]")
|
23
|
+
|
24
|
+
source ={book: {title: title,book_id: book_id, author: author,pubdate: pubdate, publisher: publisher}}
|
25
|
+
|
26
|
+
paras.each do |para|
|
27
|
+
para_attrs = {'_id' => para['id']}
|
28
|
+
content = para.search("content")[0].text
|
29
|
+
|
30
|
+
keywords = []
|
31
|
+
para.search("keyword").each do |keyword|
|
32
|
+
keywords << {keyword: keyword.text, weight: keyword['weight'].to_i}
|
33
|
+
end
|
34
|
+
|
35
|
+
para_attrs = para_attrs.merge(keywords: keywords, content: content)
|
36
|
+
section = para.parent.search("info title").text
|
37
|
+
para_attrs = para_attrs.merge(source: source.merge(location: {section: section}))
|
38
|
+
|
39
|
+
coll.insert(para_attrs)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: uuid
|
@@ -123,16 +123,34 @@ dependencies:
|
|
123
123
|
- - ! '>='
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: mongo
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :runtime
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
126
142
|
description: 电子书工具集.
|
127
143
|
email: aaron@nonobo.com
|
128
144
|
executables:
|
129
145
|
- ebook_tools
|
146
|
+
- para_import_mongo
|
130
147
|
extensions: []
|
131
148
|
extra_rdoc_files: []
|
132
149
|
files:
|
133
150
|
- README
|
134
151
|
- CHANGELOG
|
135
152
|
- bin/ebook_tools
|
153
|
+
- bin/para_import_mongo
|
136
154
|
- lib/ebook_tools.rb
|
137
155
|
- lib/extract_book_struct.rb
|
138
156
|
- lib/header_detect.rb
|
@@ -140,6 +158,7 @@ files:
|
|
140
158
|
- lib/txt.rb
|
141
159
|
- lib/epub.rb
|
142
160
|
- lib/utils.rb
|
161
|
+
- lib/paras_in_mongo.rb
|
143
162
|
- ebook_tools.gemspec
|
144
163
|
homepage:
|
145
164
|
licenses: []
|