ebook_tools 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +4 -0
- data/bin/ebook_tools +1 -0
- data/bin/para_import_scheduling +73 -0
- data/ebook_tools.gemspec +4 -1
- data/lib/ebook_tools.rb +3 -1
- data/lib/paras_in_mongo.rb +5 -3
- data/lib/utils.rb +1 -1
- data/workers/para_import_worker.rb +22 -0
- metadata +19 -1
data/CHANGELOG
CHANGED
data/bin/ebook_tools
CHANGED
@@ -88,6 +88,7 @@ def help(command=nil)
|
|
88
88
|
sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
|
89
89
|
iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
|
90
90
|
sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
|
91
|
+
sed -i 's/第\([0-9]\+\)节://g' <file> : 目录前面有第*节时需要清除
|
91
92
|
EOF
|
92
93
|
end
|
93
94
|
|
data/bin/para_import_scheduling
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','utils')
|
6
|
+
require File.join(File.expand_path('../../',__FILE__),'workers','para_import_worker')
|
7
|
+
|
8
|
+
def help
|
9
|
+
puts <<-EOF
|
10
|
+
usage:
|
11
|
+
para_import_scheduling [options] <docbook>
|
12
|
+
|
13
|
+
docbook: 已经标注重点段落的书文件或目录
|
14
|
+
|
15
|
+
options:
|
16
|
+
-H <host> , --host <host> : mongodb服务器,默认为localhost
|
17
|
+
-P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
|
18
|
+
-D <database>, --database <database> : 重点段落要存放的数据库
|
19
|
+
-C <collection>, --collection <collection> : 重点段落存放的集合
|
20
|
+
EOF
|
21
|
+
exit
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
options = {:host=>'localhost',:port=>27017,:database=>'resource_development',:collection=>'paras'}
|
26
|
+
opts = OptionParser.new do |opts|
|
27
|
+
opts.on('-H host','--host host') do |host|
|
28
|
+
options[:host] = host
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on('-P port','--port port') do |port|
|
32
|
+
options[:port] = port.to_i
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on('-D database','--database database') do |database|
|
36
|
+
options[:database] = database
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on('-C collection','--collection collection') do |collection|
|
40
|
+
options[:collection] = collection
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-h','--help') do
|
44
|
+
help
|
45
|
+
end
|
46
|
+
end
|
47
|
+
opts.parse ARGV
|
48
|
+
|
49
|
+
docbook = ARGV[-1]
|
50
|
+
|
51
|
+
if docbook.nil?
|
52
|
+
help
|
53
|
+
end
|
54
|
+
|
55
|
+
unless File.exists?(docbook)
|
56
|
+
raise "错误:指定的docbook不存在。"
|
57
|
+
end
|
58
|
+
|
59
|
+
files = if File.directory?(docbook)
|
60
|
+
Utils.scan_file_from_dir(docbook,{:format=>'.xml'})
|
61
|
+
else
|
62
|
+
[docbook]
|
63
|
+
end
|
64
|
+
|
65
|
+
files.each do |file|
|
66
|
+
begin
|
67
|
+
#ParasInMongo.file_in_mongo(file,options)
|
68
|
+
ParaImportWorker.perform_async(file,options)
|
69
|
+
rescue
|
70
|
+
puts "error: #{file} to job failure!"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
puts "success: #{docbook} to job successfully!"
|
data/ebook_tools.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.1.1'
|
5
|
+
s.version = '0.1.2'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
|
|
23
23
|
"bin/ebook_tools",
|
24
24
|
"bin/para_import_mongo",
|
25
25
|
"bin/doc_book_import_mongo",
|
26
|
+
"bin/para_import_scheduling",
|
26
27
|
"bin/xml2json",
|
27
28
|
"lib/ebook_tools.rb",
|
28
29
|
"lib/txt_book.rb",
|
@@ -33,6 +34,7 @@ Gem::Specification.new do |s|
|
|
33
34
|
"lib/utils.rb",
|
34
35
|
"lib/paras_in_mongo.rb",
|
35
36
|
"lib/doc_book_in_mongo.rb",
|
37
|
+
"workers/para_import_worker.rb",
|
36
38
|
"ebook_tools.gemspec"
|
37
39
|
]
|
38
40
|
s.add_dependency(%q<uuid>)
|
@@ -43,4 +45,5 @@ Gem::Specification.new do |s|
|
|
43
45
|
s.add_dependency(%q<nokogiri>)
|
44
46
|
s.add_dependency(%q<levenshtein>)
|
45
47
|
s.add_dependency(%q<moped>)
|
48
|
+
s.add_dependency(%q<sidekiq>)
|
46
49
|
end
|
data/lib/ebook_tools.rb
CHANGED
@@ -246,8 +246,10 @@ module EbookTools
|
|
246
246
|
if extract_book_struct_to_file(file,dest_file)
|
247
247
|
puts "success: extract book struct successfully!"
|
248
248
|
else
|
249
|
+
new_file = File.join(File.dirname(file),"[err]#{basename}#{extname}")
|
250
|
+
FileUtils.mv(file,new_file,:force=>true)
|
249
251
|
puts "警告: 没有检测到书结构信息."
|
250
|
-
end
|
252
|
+
end
|
251
253
|
rescue Exception => e
|
252
254
|
puts "error: #{file} \n#{e.backtrace.join("\n")}"
|
253
255
|
end
|
data/lib/paras_in_mongo.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'moped'
|
3
3
|
require 'nokogiri'
|
4
|
+
require 'active_support'
|
4
5
|
|
5
6
|
module ParasInMongo
|
6
7
|
extend self
|
7
8
|
|
8
9
|
def file_in_mongo(filename,options={})
|
9
|
-
|
10
|
-
session.
|
10
|
+
options = options.stringify_keys
|
11
|
+
session = Moped::Session.new([ "#{options['host']}:#{options['port']}" ])
|
12
|
+
session.use options['database']
|
11
13
|
|
12
14
|
doc = Nokogiri::XML(File.open(filename).read)
|
13
15
|
|
@@ -33,7 +35,7 @@ module ParasInMongo
|
|
33
35
|
para_attrs = para_attrs.merge(keywords: keywords, content: content)
|
34
36
|
section = para.parent.search("info title").text
|
35
37
|
para_attrs = para_attrs.merge(source: source.merge(location: {section: section}))
|
36
|
-
session[options[
|
38
|
+
session[options['collection']].insert(para_attrs)
|
37
39
|
end
|
38
40
|
end
|
39
41
|
end
|
data/lib/utils.rb
CHANGED
data/workers/para_import_worker.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
|
3
|
+
|
4
|
+
# If your client is single-threaded, we just need a single connection in our Redis connection pool
|
5
|
+
Sidekiq.configure_client do |config|
|
6
|
+
config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
|
7
|
+
end
|
8
|
+
|
9
|
+
# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
|
10
|
+
Sidekiq.configure_server do |config|
|
11
|
+
config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
|
12
|
+
end
|
13
|
+
|
14
|
+
class ParaImportWorker
|
15
|
+
include Sidekiq::Worker
|
16
|
+
|
17
|
+
def perform(filename,options={})
|
18
|
+
puts "Workin' #{filename}"
|
19
|
+
ParasInMongo.file_in_mongo(filename,options)
|
20
|
+
puts "#{filename} in mongo successfully"
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -139,6 +139,22 @@ dependencies:
|
|
139
139
|
- - ! '>='
|
140
140
|
- !ruby/object:Gem::Version
|
141
141
|
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: sidekiq
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
142
158
|
description: 电子书工具集.
|
143
159
|
email: aaron@nonobo.com
|
144
160
|
executables:
|
@@ -153,6 +169,7 @@ files:
|
|
153
169
|
- bin/ebook_tools
|
154
170
|
- bin/para_import_mongo
|
155
171
|
- bin/doc_book_import_mongo
|
172
|
+
- bin/para_import_scheduling
|
156
173
|
- bin/xml2json
|
157
174
|
- lib/ebook_tools.rb
|
158
175
|
- lib/txt_book.rb
|
@@ -163,6 +180,7 @@ files:
|
|
163
180
|
- lib/utils.rb
|
164
181
|
- lib/paras_in_mongo.rb
|
165
182
|
- lib/doc_book_in_mongo.rb
|
183
|
+
- workers/para_import_worker.rb
|
166
184
|
- ebook_tools.gemspec
|
167
185
|
homepage:
|
168
186
|
licenses: []
|