ebook_tools 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -2
- data/bin/ebook_tools +5 -3
- data/bin/para_import_mongo +72 -0
- data/ebook_tools.gemspec +6 -2
- data/lib/paras_in_mongo.rb +42 -0
- metadata +21 -2
data/CHANGELOG
CHANGED
data/bin/ebook_tools
CHANGED
@@ -82,10 +82,12 @@ def help(command=nil)
|
|
82
82
|
batch_convert: 批量转换指定目录中的文件为epub格式文件,并存放到目标目录
|
83
83
|
batch_extract: 批量提取指定目录中文件的书结构信息,并生成Docbook存放到目标目录
|
84
84
|
|
85
|
-
适用对象要求:
|
86
|
-
编码格式为utf-8
|
87
|
-
|
88
85
|
具体命令的更多信息请通过'ebook_tools help <command>'查看。
|
86
|
+
|
87
|
+
其他命令:
|
88
|
+
sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
|
89
|
+
iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
|
90
|
+
sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
|
89
91
|
EOF
|
90
92
|
end
|
91
93
|
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','utils')
|
6
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
|
7
|
+
|
8
|
+
# Print the usage banner for para_import_mongo to standard output and then
# terminate the process via `exit`.
def help
  usage_text = <<-USAGE
usage:
para_import_mongo [options] <docbook>

docbook: 已经标注重点段落的书文件或目录

options:
-H <host> , --host <host> : mongodb服务器,默认为localhost
-P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
-D <database>, --database <database> : 重点段落要存放的数据库
-C <collection>, --collection <collection> : 重点段落存放的集合
  USAGE
  puts usage_text
  exit
end
|
23
|
+
|
24
|
+
|
25
|
+
# Connection defaults; each one can be overridden from the command line.
options = {:host=>'localhost', :port=>27017, :database=>'resource_development', :collection=>'paras'}

# The parser is bound to `parser` and the block parameter to `o`, so the block
# parameter no longer shadows the outer local.
parser = OptionParser.new do |o|
  o.on('-H host', '--host host') do |host|
    options[:host] = host
  end

  o.on('-P port', '--port port') do |port|
    options[:port] = port.to_i
  end

  o.on('-D database', '--database database') do |database|
    options[:database] = database
  end

  o.on('-C collection', '--collection collection') do |collection|
    options[:collection] = collection
  end

  o.on('-h', '--help') do
    help
  end
end

# `parse` does not modify ARGV; it returns the arguments left over once the
# options and their values are removed. Reading ARGV[-1] directly would treat
# an option value (e.g. the "mydb" in `-D mydb`) as the docbook path.
positional = parser.parse(ARGV)
docbook = positional.last

# `help` prints the usage text and exits, so nothing below runs without a path.
help if docbook.nil?

# File.exists? was removed in Ruby 3.2; File.exist? is available everywhere.
unless File.exist?(docbook)
  raise "错误:指定的docbook不存在。"
end

files = if File.directory?(docbook)
  Utils.scan_file_from_dir(docbook, {:format=>'.xml'})
else
  [docbook]
end

# Import every file on a best-effort basis, but remember which ones failed and
# why, so the final status reflects what actually happened.
failed = []
files.each do |file|
  begin
    ParasInMongo.file_in_mongo(file, options)
  rescue => e
    failed << file
    puts "error: #{file} import mongo failure! (#{e.class}: #{e.message})"
  end
end

if failed.empty?
  puts "success: #{docbook} in mongo successfully!"
else
  # Non-zero exit status lets calling scripts detect a partial or failed import.
  exit 1
end
|
data/ebook_tools.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.0.3'
|
5
|
+
s.version = '0.0.4'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
9
|
-
s.date = %q{2013-04-
|
9
|
+
s.date = %q{2013-04-05}
|
10
10
|
s.description = %q{电子书工具集.}
|
11
11
|
s.email = %q{aaron@nonobo.com}
|
12
12
|
s.require_paths = ["lib"]
|
@@ -15,10 +15,12 @@ Gem::Specification.new do |s|
|
|
15
15
|
s.has_rdoc = true
|
16
16
|
s.rdoc_options = ["--charset=UTF-8"]
|
17
17
|
s.executables << "ebook_tools"
|
18
|
+
s.executables << "para_import_mongo"
|
18
19
|
s.files = [
|
19
20
|
"README",
|
20
21
|
"CHANGELOG",
|
21
22
|
"bin/ebook_tools",
|
23
|
+
"bin/para_import_mongo",
|
22
24
|
"lib/ebook_tools.rb",
|
23
25
|
"lib/extract_book_struct.rb",
|
24
26
|
"lib/header_detect.rb",
|
@@ -26,6 +28,7 @@ Gem::Specification.new do |s|
|
|
26
28
|
"lib/txt.rb",
|
27
29
|
"lib/epub.rb",
|
28
30
|
"lib/utils.rb",
|
31
|
+
"lib/paras_in_mongo.rb",
|
29
32
|
"ebook_tools.gemspec"
|
30
33
|
]
|
31
34
|
s.add_dependency(%q<uuid>)
|
@@ -35,4 +38,5 @@ Gem::Specification.new do |s|
|
|
35
38
|
s.add_dependency(%q<pdf-reader>)
|
36
39
|
s.add_dependency(%q<nokogiri>)
|
37
40
|
s.add_dependency(%q<levenshtein>)
|
41
|
+
s.add_dependency(%q<mongo>)
|
38
42
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'mongo'
|
3
|
+
require 'nokogiri'
|
4
|
+
|
5
|
+
include Mongo
|
6
|
+
# Imports the highlighted ("key") paragraphs of a DocBook XML file into a
# MongoDB collection, one document per paragraph.
module ParasInMongo
  extend self

  # Parse +filename+ and insert one document for every <para key="yes">
  # element it contains.
  #
  # filename - path of the DocBook XML file to read.
  # options  - Hash providing :host, :port, :database and :collection.
  #
  # Raises ArgumentError when the file contains no <book> element. Errors from
  # reading the file, parsing it, or talking to MongoDB propagate to the caller.
  def file_in_mongo(filename, options={})
    # File.read closes the handle itself; File.open(...).read left it open
    # until garbage collection.
    doc = Nokogiri::XML(File.read(filename))
    source = book_source(doc)

    # Fully qualified so the lookup does not depend on a top-level
    # `include Mongo` being in effect.
    client = Mongo::MongoClient.new(options[:host], options[:port])
    begin
      coll = client[options[:database]][options[:collection]]
      doc.search("para[key=yes]").each do |para|
        coll.insert(para_document(para, source))
      end
    ensure
      # One client is opened per file, so release the connection even when an
      # insert fails.
      client.close
    end
  end

  private

  # Build the book-level part of the `source` sub-document shared by every
  # paragraph of the file. Missing metadata elements are stored as nil.
  def book_source(doc)
    book = doc.search("book")[0]
    raise ArgumentError, "no <book> element found in document" if book.nil?

    {book: {title: first_text(doc, "book info title"),
            book_id: book['id'],
            author: first_text(doc, "book info author"),
            pubdate: first_text(doc, "book info pubdate"),
            publisher: first_text(doc, "book info publisher")}}
  end

  # Build the MongoDB document for a single key paragraph.
  def para_document(para, source)
    keywords = para.search("keyword").map do |keyword|
      {keyword: keyword.text, weight: keyword['weight'].to_i}
    end

    # NOTE(review): this joins the text of every "info title" found beneath the
    # paragraph's parent, not only the first one - confirm that is intended.
    section = para.parent.search("info title").text

    {'_id' => para['id'],
     keywords: keywords,
     content: first_text(para, "content"),
     source: source.merge(location: {section: section})}
  end

  # Text of the first node under +node+ matching +selector+, or nil if none.
  def first_text(node, selector)
    match = node.search(selector)[0]
    match && match.text
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: uuid
|
@@ -123,16 +123,34 @@ dependencies:
|
|
123
123
|
- - ! '>='
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: mongo
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :runtime
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
126
142
|
description: 电子书工具集.
|
127
143
|
email: aaron@nonobo.com
|
128
144
|
executables:
|
129
145
|
- ebook_tools
|
146
|
+
- para_import_mongo
|
130
147
|
extensions: []
|
131
148
|
extra_rdoc_files: []
|
132
149
|
files:
|
133
150
|
- README
|
134
151
|
- CHANGELOG
|
135
152
|
- bin/ebook_tools
|
153
|
+
- bin/para_import_mongo
|
136
154
|
- lib/ebook_tools.rb
|
137
155
|
- lib/extract_book_struct.rb
|
138
156
|
- lib/header_detect.rb
|
@@ -140,6 +158,7 @@ files:
|
|
140
158
|
- lib/txt.rb
|
141
159
|
- lib/epub.rb
|
142
160
|
- lib/utils.rb
|
161
|
+
- lib/paras_in_mongo.rb
|
143
162
|
- ebook_tools.gemspec
|
144
163
|
homepage:
|
145
164
|
licenses: []
|