stevedore-uploader 1.0.11-java → 1.0.12-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/parsers/stevedore_email.rb +3 -3
- data/lib/stevedore-uploader.rb +63 -32
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f11e36b9388530683b93423de8d0fabf3997d46b
|
4
|
+
data.tar.gz: 68f2bbe57091826e63a0a4de0a39afff43c70a7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57e658b3a6ee1147563861165e96ba9346befd3f72b986ace6c4c2cdf5c590e267faa889f8ab72beea8a3aabb8a0e08093001990df39fae4d436d9b18d9622cc
|
7
|
+
data.tar.gz: 298762fd18cd429f3057eb3684510c518b11a6379a2fe692c31791e41fc9c1f08c13f8f9aea21486d18546e584f6f314c50b4180d7d00af31b7d3250da18e1e1
|
@@ -19,9 +19,9 @@ module Stevedore
|
|
19
19
|
t.message_from = metadata["Message-From"]
|
20
20
|
t.message_cc = metadata["Message-Cc"]
|
21
21
|
t.title = t.subject = metadata["subject"]
|
22
|
-
t.dkim_verified = begin
|
23
|
-
|
24
|
-
rescue
|
22
|
+
t.dkim_verified = filepath.end_with?("eml") && begin
|
23
|
+
DkimVerify::Verification::Verifier.new(open(filepath, 'r'){|f| f.read }).verify!
|
24
|
+
rescue DkimVerify::Verification::DkimError
|
25
25
|
false
|
26
26
|
end
|
27
27
|
t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -117,6 +117,31 @@ module Stevedore
|
|
117
117
|
resp
|
118
118
|
end
|
119
119
|
|
120
|
+
def self.ocr_pdf(filename)
|
121
|
+
pdf_basename = filename.gsub(".pdf", '')
|
122
|
+
ocred_pdf_filename = "#{pdf_basename}.ocr.pdf"
|
123
|
+
|
124
|
+
ret = system("convert","-monochrome","-density","300x300",filename,"-depth",'8',"#{pdf_basename}.png")
|
125
|
+
if ret.nil?
|
126
|
+
STDERR.puts "No imagemagick or ghostscript (or not on path); skipping OCR"
|
127
|
+
return nil
|
128
|
+
end
|
129
|
+
(Dir["#{pdf_basename}-*.png"] + Dir["#{pdf_basename}.png"]).sort_by{|png| (matchdata = png.match(/-\d+\.png/)).nil? ? 0 : matchdata[0].to_i }.each do |png|
|
130
|
+
ret = system('tesseract', png, png, "pdf", "")
|
131
|
+
if ret.nil?
|
132
|
+
STDERR.puts "No tesseract (or not on path); skipping OCR"
|
133
|
+
return nil
|
134
|
+
end
|
135
|
+
File.delete(png)
|
136
|
+
File.delete("#{png}.txt") rescue nil
|
137
|
+
end.join("\n\n")
|
138
|
+
# e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
|
139
|
+
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 })
|
140
|
+
return nil if files.empty?
|
141
|
+
system('pdftk', *files, "cat", "output", ocred_pdf_filename)
|
142
|
+
ocred_pdf_filename
|
143
|
+
end
|
144
|
+
|
120
145
|
def process_document(filename, download_url)
|
121
146
|
begin
|
122
147
|
puts "begin to process #{filename}"
|
@@ -128,12 +153,11 @@ module Stevedore
|
|
128
153
|
metadata = "couldn't be parsed"
|
129
154
|
end
|
130
155
|
puts "parsed: #{content.size}"
|
131
|
-
if content.size > 3 * (10 ** 7)
|
156
|
+
if content.size > 3 * (10 ** 7) # 12071712
|
132
157
|
@errors << filename
|
133
158
|
puts "skipping #{filename} for being too big"
|
134
159
|
return nil
|
135
160
|
end
|
136
|
-
puts metadata["Content-Type"].inspect
|
137
161
|
|
138
162
|
# TODO: factor these out in favor of the yield/block situation down below.
|
139
163
|
# this should (eventually) be totally generic, but perhaps handle common
|
@@ -146,23 +170,14 @@ module Stevedore
|
|
146
170
|
when @should_ocr && metadata["Content-Type"] == "application/pdf" && (content.match(/\A\s*\z/) || content.size < 50 * metadata["xmpTPg:NPages"].to_i )
|
147
171
|
# this is a scanned PDF.
|
148
172
|
puts "scanned PDF #{File.basename(filename)} detected; OCRing"
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
File.delete(png)
|
154
|
-
# no need to use a system call when we could use the stdlib!
|
155
|
-
# system("rm", "-f", png) rescue nil
|
156
|
-
File.delete("#{png}.txt") rescue nil
|
157
|
-
end.join("\n\n")
|
158
|
-
# e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
|
159
|
-
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
|
160
|
-
if files.empty?
|
161
|
-
content = ''
|
173
|
+
|
174
|
+
ocred_pdf_filename = ESUploader.ocr_pdf(filename)
|
175
|
+
if ocred_pdf_filename
|
176
|
+
content, _ = Rika.parse_content_and_metadata(ocred_pdf_filename)
|
162
177
|
else
|
163
|
-
|
164
|
-
content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
|
178
|
+
content = ''
|
165
179
|
end
|
180
|
+
|
166
181
|
puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
|
167
182
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
168
183
|
else
|
@@ -206,6 +221,30 @@ module Stevedore
|
|
206
221
|
end
|
207
222
|
end
|
208
223
|
|
224
|
+
def download_from_s3(dir, obj)
|
225
|
+
tmp_filename = File.join(dir, obj.key)
|
226
|
+
retried = false
|
227
|
+
begin
|
228
|
+
body = obj.get.body.read
|
229
|
+
File.open(tmp_filename, 'wb'){|f| f << body}
|
230
|
+
rescue Aws::S3::Errors::NoSuchKey, Aws::S3::Errors::Http503Error
|
231
|
+
@errors << obj.key
|
232
|
+
rescue ArgumentError
|
233
|
+
File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
|
234
|
+
rescue Seahorse::Client::NetworkingError
|
235
|
+
unless retried
|
236
|
+
retried = true
|
237
|
+
retry
|
238
|
+
else
|
239
|
+
puts "looks like the network's down... press any key to continue once the network's back up"
|
240
|
+
gets
|
241
|
+
retry
|
242
|
+
end
|
243
|
+
end
|
244
|
+
tmp_filename
|
245
|
+
end
|
246
|
+
|
247
|
+
|
209
248
|
def do!(target_path, output_stream=STDOUT)
|
210
249
|
output_stream.puts "Processing documents from #{target_path}"
|
211
250
|
|
@@ -227,15 +266,7 @@ module Stevedore
|
|
227
266
|
slice_of_objs.map! do |obj|
|
228
267
|
next if obj.key[-1] == "/"
|
229
268
|
FileUtils.mkdir_p(File.join(dir, File.dirname(obj.key)))
|
230
|
-
tmp_filename = File.join(dir, obj.key)
|
231
|
-
begin
|
232
|
-
body = obj.get.body.read
|
233
|
-
File.open(tmp_filename, 'wb'){|f| f << body}
|
234
|
-
rescue Aws::S3::Errors::NoSuchKey
|
235
|
-
@errors << obj.key
|
236
|
-
rescue ArgumentError
|
237
|
-
File.open(tmp_filename, 'wb'){|f| f << body.nil? ? '' : body.chars.select(&:valid_encoding?).join}
|
238
|
-
end
|
269
|
+
tmp_filename = download_from_s3(dir, obj)
|
239
270
|
download_filename = "https://#{@s3_bucket}.s3.amazonaws.com/" + obj.key
|
240
271
|
|
241
272
|
# is this file an archive that contains a bunch of documents we should index separately?
|
@@ -274,11 +305,11 @@ module Stevedore
|
|
274
305
|
puts resp.inspect if resp && resp["errors"]
|
275
306
|
rescue Manticore::Timeout, Manticore::SocketException
|
276
307
|
output_stream.puts("retrying at #{Time.now}")
|
277
|
-
if retry_count <
|
308
|
+
if retry_count < 3
|
278
309
|
retry_count += 1
|
279
310
|
retry
|
280
311
|
else
|
281
|
-
@errors
|
312
|
+
@errors += slice_of_objs.map &:key
|
282
313
|
end
|
283
314
|
end
|
284
315
|
output_stream.puts "uploaded #{slice_of_objs.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
|
@@ -322,7 +353,6 @@ module Stevedore
|
|
322
353
|
|
323
354
|
yield doc, filename, content, metadata if block_given?
|
324
355
|
# FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
325
|
-
puts doc.inspect
|
326
356
|
doc
|
327
357
|
end
|
328
358
|
else
|
@@ -340,11 +370,12 @@ module Stevedore
|
|
340
370
|
output_stream.puts "Upload error: #{e} #{e.message}."
|
341
371
|
output_stream.puts e.backtrace.join("\n") + "\n\n\n"
|
342
372
|
output_stream.puts("retrying at #{Time.now}")
|
343
|
-
if retry_count <
|
344
|
-
retry_count += 1
|
373
|
+
if retry_count < 3
|
374
|
+
retry_count += 1
|
375
|
+
sleep 30
|
345
376
|
retry
|
346
377
|
else
|
347
|
-
@errors <<
|
378
|
+
@errors << slice_of_files
|
348
379
|
end
|
349
380
|
end
|
350
381
|
output_stream.puts "uploaded #{slice_of_files.size} files to #{@es_index}; #{docs_so_far} uploaded so far"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.11
|
4
|
+
version: 1.0.12
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|