ebook_tools 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +4 -0
- data/bin/ebook_tools +1 -0
- data/bin/para_import_scheduling +73 -0
- data/ebook_tools.gemspec +4 -1
- data/lib/ebook_tools.rb +3 -1
- data/lib/paras_in_mongo.rb +5 -3
- data/lib/utils.rb +1 -1
- data/workers/para_import_worker.rb +22 -0
- metadata +19 -1
data/CHANGELOG
CHANGED
data/bin/ebook_tools
CHANGED
@@ -88,6 +88,7 @@ def help(command=nil)
|
|
88
88
|
sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
|
89
89
|
iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
|
90
90
|
sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
|
91
|
+
sed -i 's/第\([0-9]\+\)节://g' <file> : 目录前面有第*节时需要清除
|
91
92
|
EOF
|
92
93
|
end
|
93
94
|
|
data/bin/para_import_scheduling
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
require 'rubygems'
|
4
|
+
require 'optparse'
|
5
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','utils')
|
6
|
+
require File.join(File.expand_path('../../',__FILE__),'workers','para_import_worker')
|
7
|
+
|
8
|
+
def help
|
9
|
+
puts <<-EOF
|
10
|
+
usage:
|
11
|
+
para_import_scheduling [options] <docbook>
|
12
|
+
|
13
|
+
docbook: 已经标注重点段落的书文件或目录
|
14
|
+
|
15
|
+
options:
|
16
|
+
-H <host> , --host <host> : mongodb服务器,默认为localhost
|
17
|
+
-P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
|
18
|
+
-D <database>, --database <database> : 重点段落要存放的数据库
|
19
|
+
-C <collection>, --collection <collection> : 重点段落存放的集合
|
20
|
+
EOF
|
21
|
+
exit
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
options = {:host=>'localhost',:port=>27017,:database=>'resource_development',:collection=>'paras'}
|
26
|
+
opts = OptionParser.new do |opts|
|
27
|
+
opts.on('-H host','--host host') do |host|
|
28
|
+
options[:host] = host
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on('-P port','--port port') do |port|
|
32
|
+
options[:port] = port.to_i
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on('-D database','--database database') do |database|
|
36
|
+
options[:database] = database
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on('-C collection','--collection collection') do |collection|
|
40
|
+
options[:collection] = collection
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on('-h','--help') do
|
44
|
+
help
|
45
|
+
end
|
46
|
+
end
|
47
|
+
opts.parse ARGV
|
48
|
+
|
49
|
+
docbook = ARGV[-1]
|
50
|
+
|
51
|
+
if docbook.nil?
|
52
|
+
help
|
53
|
+
end
|
54
|
+
|
55
|
+
unless File.exists?(docbook)
|
56
|
+
raise "错误:指定的docbook不存在。"
|
57
|
+
end
|
58
|
+
|
59
|
+
files = if File.directory?(docbook)
|
60
|
+
Utils.scan_file_from_dir(docbook,{:format=>'.xml'})
|
61
|
+
else
|
62
|
+
[docbook]
|
63
|
+
end
|
64
|
+
|
65
|
+
files.each do |file|
|
66
|
+
begin
|
67
|
+
#ParasInMongo.file_in_mongo(file,options)
|
68
|
+
ParaImportWorker.perform_async(file,options)
|
69
|
+
rescue
|
70
|
+
puts "error: #{file} to job failure!"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
puts "success: #{docbook} to job successfully!"
|
data/ebook_tools.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{ebook_tools}
|
5
|
-
s.version = '0.1.1'
|
5
|
+
s.version = '0.1.2'
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Aaron"]
|
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
|
|
23
23
|
"bin/ebook_tools",
|
24
24
|
"bin/para_import_mongo",
|
25
25
|
"bin/doc_book_import_mongo",
|
26
|
+
"bin/para_import_scheduling",
|
26
27
|
"bin/xml2json",
|
27
28
|
"lib/ebook_tools.rb",
|
28
29
|
"lib/txt_book.rb",
|
@@ -33,6 +34,7 @@ Gem::Specification.new do |s|
|
|
33
34
|
"lib/utils.rb",
|
34
35
|
"lib/paras_in_mongo.rb",
|
35
36
|
"lib/doc_book_in_mongo.rb",
|
37
|
+
"workers/para_import_worker.rb",
|
36
38
|
"ebook_tools.gemspec"
|
37
39
|
]
|
38
40
|
s.add_dependency(%q<uuid>)
|
@@ -43,4 +45,5 @@ Gem::Specification.new do |s|
|
|
43
45
|
s.add_dependency(%q<nokogiri>)
|
44
46
|
s.add_dependency(%q<levenshtein>)
|
45
47
|
s.add_dependency(%q<moped>)
|
48
|
+
s.add_dependency(%q<sidekiq>)
|
46
49
|
end
|
data/lib/ebook_tools.rb
CHANGED
@@ -246,8 +246,10 @@ module EbookTools
|
|
246
246
|
if extract_book_struct_to_file(file,dest_file)
|
247
247
|
puts "success: extract book struct successfully!"
|
248
248
|
else
|
249
|
+
new_file = File.join(File.dirname(file),"[err]#{basename}#{extname}")
|
250
|
+
FileUtils.mv(file,new_file,:force=>true)
|
249
251
|
puts "警告: 没有检测到书结构信息."
|
250
|
-
end
|
252
|
+
end
|
251
253
|
rescue Exception => e
|
252
254
|
puts "error: #{file} \n#{e.backtrace.join("\n")}"
|
253
255
|
end
|
data/lib/paras_in_mongo.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
require 'moped'
|
3
3
|
require 'nokogiri'
|
4
|
+
require 'active_support'
|
4
5
|
|
5
6
|
module ParasInMongo
|
6
7
|
extend self
|
7
8
|
|
8
9
|
def file_in_mongo(filename,options={})
|
9
|
-
|
10
|
-
session.use options[:database]
|
10
|
+
options = options.stringify_keys
|
11
|
+
session = Moped::Session.new([ "#{options['host']}:#{options['port']}" ])
|
12
|
+
session.use options['database']
|
11
13
|
|
12
14
|
doc = Nokogiri::XML(File.open(filename).read)
|
13
15
|
|
@@ -33,7 +35,7 @@ module ParasInMongo
|
|
33
35
|
para_attrs = para_attrs.merge(keywords: keywords, content: content)
|
34
36
|
section = para.parent.search("info title").text
|
35
37
|
para_attrs = para_attrs.merge(source: source.merge(location: {section: section}))
|
36
|
-
session[options[:collection]].insert(para_attrs)
|
38
|
+
session[options['collection']].insert(para_attrs)
|
37
39
|
end
|
38
40
|
end
|
39
41
|
end
|
data/lib/utils.rb
CHANGED
data/workers/para_import_worker.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'sidekiq'
|
2
|
+
require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
|
3
|
+
|
4
|
+
# If your client is single-threaded, we just need a single connection in our Redis connection pool
|
5
|
+
Sidekiq.configure_client do |config|
|
6
|
+
config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
|
7
|
+
end
|
8
|
+
|
9
|
+
# Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
|
10
|
+
Sidekiq.configure_server do |config|
|
11
|
+
config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
|
12
|
+
end
|
13
|
+
|
14
|
+
class ParaImportWorker
|
15
|
+
include Sidekiq::Worker
|
16
|
+
|
17
|
+
def perform(filename,options={})
|
18
|
+
puts "Workin' #{filename}"
|
19
|
+
ParasInMongo.file_in_mongo(filename,options)
|
20
|
+
puts "#{filename} in mongo successfully"
|
21
|
+
end
|
22
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ebook_tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -139,6 +139,22 @@ dependencies:
|
|
139
139
|
- - ! '>='
|
140
140
|
- !ruby/object:Gem::Version
|
141
141
|
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: sidekiq
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ! '>='
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
type: :runtime
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ! '>='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
142
158
|
description: 电子书工具集.
|
143
159
|
email: aaron@nonobo.com
|
144
160
|
executables:
|
@@ -153,6 +169,7 @@ files:
|
|
153
169
|
- bin/ebook_tools
|
154
170
|
- bin/para_import_mongo
|
155
171
|
- bin/doc_book_import_mongo
|
172
|
+
- bin/para_import_scheduling
|
156
173
|
- bin/xml2json
|
157
174
|
- lib/ebook_tools.rb
|
158
175
|
- lib/txt_book.rb
|
@@ -163,6 +180,7 @@ files:
|
|
163
180
|
- lib/utils.rb
|
164
181
|
- lib/paras_in_mongo.rb
|
165
182
|
- lib/doc_book_in_mongo.rb
|
183
|
+
- workers/para_import_worker.rb
|
166
184
|
- ebook_tools.gemspec
|
167
185
|
homepage:
|
168
186
|
licenses: []
|