ebook_tools 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ 0.1.2 2013.5.27
2
+ fix bug: 对没有检测到目录结构的书进行错误标注
3
+ new: 新增异步导入paras
4
+
1
5
  0.1.1 2013.5.26
2
6
  fix bug: 提取目录结构时文本内容开始部分存在全角空格而无法正确提取目录结构
3
7
  fix bug: 无法提取文本目录中包含“?”等标点符号的目录
data/bin/ebook_tools CHANGED
@@ -88,6 +88,7 @@ def help(command=nil)
88
88
  sed -i 's/^\(\s*[0-9]\{1,3\}\)./\1 /' <file> : 将 "1.xxx" 格式替换成 "1 xxx"
89
89
  iconv --from-code GB2312 -t UTF-8 -c -o <output file> <file> : 将GB2312格式文件转换成UTF-8
90
90
  sed -i 's/\s*第.\{1,3\}章.*[0-9]\{1,3\}$//g' <file> : 将 "第xxx章 xxxx1" 格式清除。主要用于有些pdf文档转换过来时的页眉信息。
91
+ sed -i 's/第\([0-9]\+\)节://g' <file> : 目录前面有第*节时需要清除
91
92
  EOF
92
93
  end
93
94
 
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ require 'rubygems'
4
+ require 'optparse'
5
+ require File.join(File.expand_path('../../',__FILE__),'lib','utils')
6
+ require File.join(File.expand_path('../../',__FILE__),'workers','para_import_worker')
7
+
8
+ def help
9
+ puts <<-EOF
10
+ usage:
11
+ para_import_scheduling [options] <docbook>
12
+
13
+ docbook: 已经标注重点段落的书文件或目录
14
+
15
+ options:
16
+ -H <host> , --host <host> : mongodb服务器,默认为localhost
17
+ -P <port> , --port <port> : mongodb服务器端口号,默认为27017 (Mongo默认端口号)
18
+ -D <database>, --database <database> : 重点段落要存放的数据库
19
+ -C <collection>, --collection <collection> : 重点段落存放的集合
20
+ EOF
21
+ exit
22
+ end
23
+
24
+
25
+ options = {:host=>'localhost',:port=>27017,:database=>'resource_development',:collection=>'paras'}
26
+ opts = OptionParser.new do |opts|
27
+ opts.on('-H host','--host host') do |host|
28
+ options[:host] = host
29
+ end
30
+
31
+ opts.on('-P port','--port port') do |port|
32
+ options[:port] = port.to_i
33
+ end
34
+
35
+ opts.on('-D database','--database database') do |database|
36
+ options[:database] = database
37
+ end
38
+
39
+ opts.on('-C collection','--collection collection') do |collection|
40
+ options[:collection] = collection
41
+ end
42
+
43
+ opts.on('-h','--help') do
44
+ help
45
+ end
46
+ end
47
+ opts.parse ARGV
48
+
49
+ docbook = ARGV[-1]
50
+
51
+ if docbook.nil?
52
+ help
53
+ end
54
+
55
+ unless File.exists?(docbook)
56
+ raise "错误:指定的docbook不存在。"
57
+ end
58
+
59
+ files = if File.directory?(docbook)
60
+ Utils.scan_file_from_dir(docbook,{:format=>'.xml'})
61
+ else
62
+ [docbook]
63
+ end
64
+
65
+ files.each do |file|
66
+ begin
67
+ #ParasInMongo.file_in_mongo(file,options)
68
+ ParaImportWorker.perform_async(file,options)
69
+ rescue
70
+ puts "error: #{file} to job failure!"
71
+ end
72
+ end
73
+ puts "success: #{docbook} to job successfully!"
data/ebook_tools.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{ebook_tools}
5
- s.version = '0.1.1'
5
+ s.version = '0.1.2'
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Aaron"]
@@ -23,6 +23,7 @@ Gem::Specification.new do |s|
23
23
  "bin/ebook_tools",
24
24
  "bin/para_import_mongo",
25
25
  "bin/doc_book_import_mongo",
26
+ "bin/para_import_scheduling",
26
27
  "bin/xml2json",
27
28
  "lib/ebook_tools.rb",
28
29
  "lib/txt_book.rb",
@@ -33,6 +34,7 @@ Gem::Specification.new do |s|
33
34
  "lib/utils.rb",
34
35
  "lib/paras_in_mongo.rb",
35
36
  "lib/doc_book_in_mongo.rb",
37
+ "workers/para_import_worker.rb",
36
38
  "ebook_tools.gemspec"
37
39
  ]
38
40
  s.add_dependency(%q<uuid>)
@@ -43,4 +45,5 @@ Gem::Specification.new do |s|
43
45
  s.add_dependency(%q<nokogiri>)
44
46
  s.add_dependency(%q<levenshtein>)
45
47
  s.add_dependency(%q<moped>)
48
+ s.add_dependency(%q<sidekiq>)
46
49
  end
data/lib/ebook_tools.rb CHANGED
@@ -246,8 +246,10 @@ module EbookTools
246
246
  if extract_book_struct_to_file(file,dest_file)
247
247
  puts "success: extract book struct successfully!"
248
248
  else
249
+ new_file = File.join(File.dirname(file),"[err]#{basename}#{extname}")
250
+ FileUtils.mv(file,new_file,:force=>true)
249
251
  puts "警告: 没有检测到书结构信息."
250
- end
252
+ end
251
253
  rescue Exception => e
252
254
  puts "error: #{file} \n#{e.backtrace.join("\n")}"
253
255
  end
@@ -1,13 +1,15 @@
1
1
  # encoding: UTF-8
2
2
  require 'moped'
3
3
  require 'nokogiri'
4
+ require 'active_support'
4
5
 
5
6
  module ParasInMongo
6
7
  extend self
7
8
 
8
9
  def file_in_mongo(filename,options={})
9
- session = Moped::Session.new([ "#{options[:host]}:#{options[:port]}" ])
10
- session.use options[:database]
10
+ options = options.stringify_keys
11
+ session = Moped::Session.new([ "#{options['host']}:#{options['port']}" ])
12
+ session.use options['database']
11
13
 
12
14
  doc = Nokogiri::XML(File.open(filename).read)
13
15
 
@@ -33,7 +35,7 @@ module ParasInMongo
33
35
  para_attrs = para_attrs.merge(keywords: keywords, content: content)
34
36
  section = para.parent.search("info title").text
35
37
  para_attrs = para_attrs.merge(source: source.merge(location: {section: section}))
36
- session[options[:collection]].insert(para_attrs)
38
+ session[options['collection']].insert(para_attrs)
37
39
  end
38
40
  end
39
41
  end
data/lib/utils.rb CHANGED
@@ -195,7 +195,7 @@ module Utils
195
195
  def scan_file_from_dir(dir,options={})
196
196
  files = []
197
197
  walk_dir(dir,options) do |file|
198
- files << file.to_s
198
+ files << file.realpath.to_s
199
199
  end
200
200
  files
201
201
  end
@@ -0,0 +1,22 @@
1
+ require 'sidekiq'
2
+ require File.join(File.expand_path('../../',__FILE__),'lib','paras_in_mongo')
3
+
4
+ # If your client is single-threaded, we just need a single connection in our Redis connection pool
5
+ Sidekiq.configure_client do |config|
6
+ config.redis = { :namespace => 'x', :size => 1, :url => 'redis://localhost:6379/14' }
7
+ end
8
+
9
+ # Sidekiq server is multi-threaded so our Redis connection pool size defaults to concurrency (-c)
10
+ Sidekiq.configure_server do |config|
11
+ config.redis = { :namespace => 'x', :url => 'redis://localhost:6379/14' }
12
+ end
13
+
14
+ class ParaImportWorker
15
+ include Sidekiq::Worker
16
+
17
+ def perform(filename,options={})
18
+ puts "Workin' #{filename}"
19
+ ParasInMongo.file_in_mongo(filename,options)
20
+ puts "#{filename} in mongo successfully"
21
+ end
22
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ebook_tools
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -139,6 +139,22 @@ dependencies:
139
139
  - - ! '>='
140
140
  - !ruby/object:Gem::Version
141
141
  version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: sidekiq
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ! '>='
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ type: :runtime
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
142
158
  description: 电子书工具集.
143
159
  email: aaron@nonobo.com
144
160
  executables:
@@ -153,6 +169,7 @@ files:
153
169
  - bin/ebook_tools
154
170
  - bin/para_import_mongo
155
171
  - bin/doc_book_import_mongo
172
+ - bin/para_import_scheduling
156
173
  - bin/xml2json
157
174
  - lib/ebook_tools.rb
158
175
  - lib/txt_book.rb
@@ -163,6 +180,7 @@ files:
163
180
  - lib/utils.rb
164
181
  - lib/paras_in_mongo.rb
165
182
  - lib/doc_book_in_mongo.rb
183
+ - workers/para_import_worker.rb
166
184
  - ebook_tools.gemspec
167
185
  homepage:
168
186
  licenses: []