stevedore-uploader 1.0.11-java → 1.0.12-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/parsers/stevedore_email.rb +3 -3
- data/lib/stevedore-uploader.rb +63 -32
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f11e36b9388530683b93423de8d0fabf3997d46b
|
4
|
+
data.tar.gz: 68f2bbe57091826e63a0a4de0a39afff43c70a7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57e658b3a6ee1147563861165e96ba9346befd3f72b986ace6c4c2cdf5c590e267faa889f8ab72beea8a3aabb8a0e08093001990df39fae4d436d9b18d9622cc
|
7
|
+
data.tar.gz: 298762fd18cd429f3057eb3684510c518b11a6379a2fe692c31791e41fc9c1f08c13f8f9aea21486d18546e584f6f314c50b4180d7d00af31b7d3250da18e1e1
|
@@ -19,9 +19,9 @@ module Stevedore
|
|
19
19
|
t.message_from = metadata["Message-From"]
|
20
20
|
t.message_cc = metadata["Message-Cc"]
|
21
21
|
t.title = t.subject = metadata["subject"]
|
22
|
-
t.dkim_verified = begin
|
23
|
-
|
24
|
-
rescue
|
22
|
+
t.dkim_verified = filepath.end_with?("eml") && begin
|
23
|
+
DkimVerify::Verification::Verifier.new(open(filepath, 'r'){|f| f.read }).verify!
|
24
|
+
rescue DkimVerify::Verification::DkimError
|
25
25
|
false
|
26
26
|
end
|
27
27
|
t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -117,6 +117,31 @@ module Stevedore
|
|
117
117
|
resp
|
118
118
|
end
|
119
119
|
|
120
|
+
# OCRs a scanned PDF by shelling out to three external tools:
#   1. `convert` rasterizes the PDF into PNG page images,
#   2. `tesseract` turns each PNG into a one-page PDF ("<png>.pdf"),
#   3. `pdftk` concatenates the page PDFs into "<basename>.ocr.pdf".
#
# @param filename [String] path to the PDF to OCR
# @return [String, nil] path of the merged OCRed PDF, or nil when a required
#   tool cannot be run, no page PDFs were produced, or the merge failed
#
# NOTE(review): the basename is interpolated into Dir[] patterns, so paths that
# contain glob metacharacters (`*`, `?`, `[`, `{`) may not be matched — confirm
# whether such filenames occur in practice.
def self.ocr_pdf(filename)
  # Strip only the trailing extension. The previous gsub(".pdf", '') removed
  # every ".pdf" in the path (e.g. "scans.pdf.d/doc.pdf" became "scans.d/doc").
  pdf_basename = filename.sub(/\.pdf\z/, '')
  ocred_pdf_filename = "#{pdf_basename}.ocr.pdf"

  # Kernel#system returns nil when the command could not be executed at all.
  rasterized = system("convert", "-monochrome", "-density", "300x300", filename, "-depth", '8', "#{pdf_basename}.png")
  if rasterized.nil?
    STDERR.puts "No imagemagick or ghostscript (or not on path); skipping OCR"
    return nil
  end

  # Page images are looked up as "<basename>-N.png" and "<basename>.png".
  # Processing order is irrelevant here; the page PDFs are ordered below.
  (Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).each do |png|
    ocred = system('tesseract', png, png, "pdf", "")
    if ocred.nil?
      STDERR.puts "No tesseract (or not on path); skipping OCR"
      return nil
    end
    File.delete(png)
    begin
      File.delete("#{png}.txt")
    rescue SystemCallError
      # Best-effort cleanup: the .txt side file usually does not exist.
    end
  end

  # e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
  # The basename is escaped so regex metacharacters in the path (parentheses,
  # "+", "." ...) cannot break the page-number match or raise RegexpError.
  page_pattern = Regexp.new("#{Regexp.escape(pdf_basename)}-([0-9]+)\\.png\\.pdf")
  numbered_pages = Dir["#{pdf_basename}-*.png.pdf"].sort_by do |pdf|
    m = page_pattern.match(pdf)
    # Files without a recognizable page number sort to the end.
    m ? m[1].to_i : Float::INFINITY
  end
  files = Dir["#{pdf_basename}.png.pdf"] + numbered_pages
  return nil if files.empty?

  # Do not hand back a path that was never written.
  unless system('pdftk', *files, "cat", "output", ocred_pdf_filename)
    STDERR.puts "pdftk failed or is not on path; skipping OCR"
    return nil
  end
  ocred_pdf_filename
end
|
144
|
+
|
120
145
|
def process_document(filename, download_url)
|
121
146
|
begin
|
122
147
|
puts "begin to process #{filename}"
|
@@ -128,12 +153,11 @@ module Stevedore
|
|
128
153
|
metadata = "couldn't be parsed"
|
129
154
|
end
|
130
155
|
puts "parsed: #{content.size}"
|
131
|
-
if content.size > 3 * (10 ** 7)
|
156
|
+
if content.size > 3 * (10 ** 7) # 12071712
|
132
157
|
@errors << filename
|
133
158
|
puts "skipping #{filename} for being too big"
|
134
159
|
return nil
|
135
160
|
end
|
136
|
-
puts metadata["Content-Type"].inspect
|
137
161
|
|
138
162
|
# TODO: factor these out in favor of the yield/block situation down below.
|
139
163
|
# this should (eventually) be totally generic, but perhaps handle common
|
@@ -146,23 +170,14 @@ module Stevedore
|
|
146
170
|
when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
|
147
171
|
# this is a scanned PDF.
|
148
172
|
puts "scanned PDF #{File.basename(filename)} detected; OCRing"
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
File.delete(png)
|
154
|
-
# no need to use a system call when we could use the stdlib!
|
155
|
-
# system("rm", "-f", png) rescue nil
|
156
|
-
File.delete("#{png}.txt") rescue nil
|
157
|
-
end.join("\n\n")
|
158
|
-
# e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
|
159
|
-
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
|
160
|
-
if files.empty?
|
161
|
-
content = ''
|
173
|
+
|
174
|
+
ocred_pdf_filename = ESUploader.ocr_pdf(filename)
|
175
|
+
if ocred_pdf_filename
|
176
|
+
content, _ = Rika.parse_content_and_metadata(ocred_pdf_filename)
|
162
177
|
else
|
163
|
-
|
164
|
-
content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
|
178
|
+
content = ''
|
165
179
|
end
|
180
|
+
|
166
181
|
puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
|
167
182
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
168
183
|
else
|
@@ -206,6 +221,30 @@ module Stevedore
|
|
206
221
|
end
|
207
222
|
end
|
208
223
|
|
224
|
+
# Downloads one S3 object to a local file under +dir+, using the object's key
# as the relative path.
#
# @param dir [String] local directory to download into
# @param obj [#key, #get] S3 object; +obj.get.body.read+ must yield the content
# @return [String] the local path (File.join(dir, obj.key)). It is returned even
#   when the download failed, in which case the key is appended to @errors and
#   the file may not exist.
def download_from_s3(dir, obj)
  tmp_filename = File.join(dir, obj.key)
  retried = false
  begin
    body = nil
    body = obj.get.body.read
    File.open(tmp_filename, 'wb') { |f| f << body }
  rescue Aws::S3::Errors::NoSuchKey, Aws::S3::Errors::Http503Error
    @errors << obj.key
  rescue ArgumentError
    # Compute the fallback content first. The previous form
    # `f << body.nil? ? '' : ...` parsed as `(f << body.nil?) ? ... : ...`
    # and wrote the text "true"/"false" into the file.
    salvaged = body.nil? ? '' : body.chars.select(&:valid_encoding?).join
    File.open(tmp_filename, 'wb') { |f| f << salvaged }
  rescue Seahorse::Client::NetworkingError
    # The first failure retries silently; later failures wait for the operator.
    if retried
      puts "looks like the network's down... press any key to continue once the network's back up"
      # Read from the terminal explicitly: bare `gets` reads from files named
      # in ARGV when ARGV is non-empty.
      $stdin.gets
    end
    retried = true
    retry
  end
  tmp_filename
end
|
246
|
+
|
247
|
+
|
209
248
|
def do!(target_path, output_stream=STDOUT)
|
210
249
|
output_stream.puts "Processing documents from #{target_path}"
|
211
250
|
|
@@ -227,15 +266,7 @@ module Stevedore
|
|
227
266
|
slice_of_objs.map! do |obj|
|
228
267
|
next if obj.key[-1] == "/"
|
229
268
|
FileUtils.mkdir_p(File.join(dir, File.dirname(obj.key)))
|
230
|
-
tmp_filename =
|
231
|
-
begin
|
232
|
-
body = obj.get.body.read
|
233
|
-
File.open(tmp_filename, 'wb'){|f| f << body}
|
234
|
-
rescue Aws::S3::Errors::NoSuchKey
|
235
|
-
@errors << obj.key
|
236
|
-
rescue ArgumentError
|
237
|
-
File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
|
238
|
-
end
|
269
|
+
tmp_filename = download_from_s3(dir, obj)
|
239
270
|
download_filename = "https://#{@s3_bucket}.s3.amazonaws.com/" + obj.key
|
240
271
|
|
241
272
|
# is this file an archive that contains a bunch of documents we should index separately?
|
@@ -274,11 +305,11 @@ module Stevedore
|
|
274
305
|
puts resp.inspect if resp && resp["errors"]
|
275
306
|
rescue Manticore::Timeout, Manticore::SocketException
|
276
307
|
output_stream.puts("retrying at #{Time.now}")
|
277
|
-
if retry_count <
|
308
|
+
if retry_count < 3
|
278
309
|
retry_count += 1
|
279
310
|
retry
|
280
311
|
else
|
281
|
-
@errors
|
312
|
+
@errors += slice_of_objs.map &:key
|
282
313
|
end
|
283
314
|
end
|
284
315
|
output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
|
@@ -322,7 +353,6 @@ module Stevedore
|
|
322
353
|
|
323
354
|
yield doc, filename, content, metadata if block_given?
|
324
355
|
# FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
325
|
-
puts doc.inspect
|
326
356
|
doc
|
327
357
|
end
|
328
358
|
else
|
@@ -340,11 +370,12 @@ module Stevedore
|
|
340
370
|
output_stream.puts "Upload error: #{e} #{e.message}."
|
341
371
|
output_stream.puts e.backtrace.join("\n") + "\n\n\n"
|
342
372
|
output_stream.puts("retrying at #{Time.now}")
|
343
|
-
if retry_count <
|
344
|
-
retry_count += 1
|
373
|
+
if retry_count < 3
|
374
|
+
retry_count += 1
|
375
|
+
sleep 30
|
345
376
|
retry
|
346
377
|
else
|
347
|
-
@errors <<
|
378
|
+
@errors << slice_of_files
|
348
379
|
end
|
349
380
|
end
|
350
381
|
output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.12
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|