stevedore-uploader 1.0.7-java → 1.0.9-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/lib/parsers/stevedore_blob.rb +9 -5
- data/lib/parsers/stevedore_csv_row.rb +1 -1
- data/lib/parsers/stevedore_email.rb +11 -2
- data/lib/stevedore-uploader.rb +23 -8
- metadata +33 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d46b0968e5b625a0364cd9e165930b9f0ea0849b
|
4
|
+
data.tar.gz: 1f42e1aac685f31206f33fc6c7a68f06a94e8ab3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: aa15c8bf07c5f1410396892e54b369dff1b5df14cf3521e02c7da449db84373f24c508bff3a9a28944741f8b4fb760505802b4975f62fb5aadffa516a29b6a5d
|
7
|
+
data.tar.gz: dd4c87fc77d0156dfca7324182d1f741e1babdf8320d499209d017b8b9061349f19220d5804b725225d9c794cbd2cde22bfb617a0d054fc025691368a69816b6
|
data/README.md
CHANGED
@@ -17,7 +17,7 @@ module Stevedore
|
|
17
17
|
end
|
18
18
|
|
19
19
|
def self.new_from_tika(content, metadata, download_url, filename)
|
20
|
-
self.new(metadata["title"]
|
20
|
+
self.new( ((metadata["title"] && metadata["title"] != "Untitled") ? metadata["title"] : File.basename(filename)), content, download_url)
|
21
21
|
end
|
22
22
|
|
23
23
|
def analyze!
|
@@ -26,12 +26,16 @@ module Stevedore
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def to_hash
|
29
|
+
sha = Digest::SHA1.hexdigest(download_url)
|
30
|
+
# TODO should merge in or something?
|
29
31
|
{
|
30
|
-
"sha1" =>
|
31
|
-
"
|
32
|
+
"sha1" => sha,
|
33
|
+
"id" => sha,
|
34
|
+
"_id" => sha,
|
35
|
+
"title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
|
32
36
|
"source_url" => download_url.to_s,
|
33
37
|
"file" => {
|
34
|
-
"title" => title.to_s,
|
38
|
+
"title" => title.to_s || "Untitled Document: #{HumanHash::HumanHasher.new.humanize(sha)}",
|
35
39
|
"file" => clean_text.to_s
|
36
40
|
},
|
37
41
|
"analyzed" => {
|
@@ -40,7 +44,7 @@ module Stevedore
|
|
40
44
|
"Content-Type" => extra["Content-Type"] || "text/plain"
|
41
45
|
}
|
42
46
|
},
|
43
|
-
"_updatedAt" => Time.now
|
47
|
+
"_updatedAt" => Time.now,
|
44
48
|
}
|
45
49
|
end
|
46
50
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'digest/sha1'
|
2
2
|
|
3
3
|
module Stevedore
|
4
|
-
class StevedoreCsvRow
|
4
|
+
class StevedoreCsvRow < StevedoreBlob
|
5
5
|
attr_accessor :title, :text, :download_url, :whole_row, :row_num
|
6
6
|
def initialize(title, text, row_num, download_url, whole_row={})
|
7
7
|
self.title = title || download_url
|
@@ -2,12 +2,15 @@ require_relative './stevedore_blob'
|
|
2
2
|
require 'cgi'
|
3
3
|
require 'digest/sha1'
|
4
4
|
require 'manticore'
|
5
|
+
require 'dkimverify'
|
6
|
+
|
7
|
+
|
5
8
|
module Stevedore
|
6
9
|
class StevedoreEmail < StevedoreBlob
|
7
10
|
|
8
11
|
|
9
12
|
# TODO write wrt other fields. where do those go???
|
10
|
-
attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type
|
13
|
+
attr_accessor :creation_date, :message_to, :message_from, :message_cc, :subject, :attachments, :content_type, :dkim_verified
|
11
14
|
|
12
15
|
def self.new_from_tika(content, metadata, download_url, filepath)
|
13
16
|
t = super
|
@@ -16,6 +19,11 @@ module Stevedore
|
|
16
19
|
t.message_from = metadata["Message-From"]
|
17
20
|
t.message_cc = metadata["Message-Cc"]
|
18
21
|
t.title = t.subject = metadata["subject"]
|
22
|
+
t.dkim_verified = begin
|
23
|
+
Dkim::Verifier.new(filepath).verify!
|
24
|
+
rescue Dkim::DkimError
|
25
|
+
false
|
26
|
+
end
|
19
27
|
t.attachments = metadata["X-Attachments"].to_s.split("|").map do |raw_attachment_filename|
|
20
28
|
attachment_filename = CGI::unescape(raw_attachment_filename)
|
21
29
|
possible_filename = File.join(File.dirname(filepath), attachment_filename)
|
@@ -72,7 +80,8 @@ module Stevedore
|
|
72
80
|
"Message-From" => message_to.is_a?(Enumerable) ? message_to : [ message_to ],
|
73
81
|
"Message-Cc" => message_cc.is_a?(Enumerable) ? message_cc : [ message_cc ],
|
74
82
|
"subject" => subject,
|
75
|
-
"attachments" => attachments
|
83
|
+
"attachments" => attachments,
|
84
|
+
"dkim_verified" => dkim_verified
|
76
85
|
}
|
77
86
|
},
|
78
87
|
"_updatedAt" => Time.now
|
data/lib/stevedore-uploader.rb
CHANGED
@@ -138,7 +138,7 @@ module Stevedore
|
|
138
138
|
# TODO: factor these out in favor of the yield/block situation down below.
|
139
139
|
# this should (eventually) be totally generic, but perhaps handle common
|
140
140
|
# document types on its own
|
141
|
-
|
141
|
+
doc = case # .eml # .msg
|
142
142
|
when metadata["Content-Type"] == "message/rfc822" || metadata["Content-Type"] == "application/vnd.ms-outlook"
|
143
143
|
::Stevedore::StevedoreEmail.new_from_tika(content, metadata, download_url, filename).to_hash
|
144
144
|
when metadata["Content-Type"] && ["application/html", "application/xhtml+xml"].include?(metadata["Content-Type"].split(";").first)
|
@@ -156,16 +156,19 @@ module Stevedore
|
|
156
156
|
File.delete("#{png}.txt") rescue nil
|
157
157
|
end.join("\n\n")
|
158
158
|
# e.g. Analysis-Corporation-2.png.pdf or Torture.pdf
|
159
|
-
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)[1].to_i })
|
160
|
-
|
161
|
-
|
162
|
-
|
159
|
+
files = Dir["#{pdf_basename}.png.pdf"] + (Dir["#{pdf_basename}-*.png.pdf"].sort_by{|pdf| (m = Regexp.new("#{pdf_basename}-([0-9]+).png.pdf").match(pdf)) ? m[1].to_i : 69420 }) # 69420 is a random really big number, sorting those docs to the end.
|
160
|
+
if files.empty?
|
161
|
+
content = ''
|
162
|
+
else
|
163
|
+
system('pdftk', *files, "cat", "output", "#{pdf_basename}.ocr.pdf")
|
164
|
+
content, _ = Rika.parse_content_and_metadata("#{pdf_basename}.ocr.pdf")
|
165
|
+
end
|
163
166
|
puts "OCRed content (#{File.basename(filename)}) length: #{content.length}"
|
164
167
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
165
168
|
else
|
166
169
|
::Stevedore::StevedoreBlob.new_from_tika(content, metadata, download_url, filename).to_hash
|
167
170
|
end
|
168
|
-
[
|
171
|
+
[doc, content, metadata]
|
169
172
|
rescue StandardError, java.lang.NoClassDefFoundError, org.apache.tika.exception.TikaException => e
|
170
173
|
STDERR.puts e.inspect
|
171
174
|
STDERR.puts "#{e} #{e.message}: #{filename}"
|
@@ -220,7 +223,6 @@ module Stevedore
|
|
220
223
|
s3_path_without_bucket = target_path.gsub(/s3:\/\//i, '').split("/", 2).last
|
221
224
|
bucket.objects(:prefix => s3_path_without_bucket).each_slice(@slice_size) do |slice_of_objs|
|
222
225
|
docs_so_far += slice_of_objs.size
|
223
|
-
|
224
226
|
output_stream.puts "starting a set of #{@slice_size} -- so far #{docs_so_far}"
|
225
227
|
slice_of_objs.map! do |obj|
|
226
228
|
next if obj.key[-1] == "/"
|
@@ -244,12 +246,19 @@ module Stevedore
|
|
244
246
|
if ArchiveSplitter::HANDLED_FORMATS.include?(tmp_filename.split(".")[-1])
|
245
247
|
ArchiveSplitter.split(tmp_filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
246
248
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
249
|
+
next nil if doc.nil?
|
247
250
|
doc["analyzed"] ||= {}
|
248
251
|
doc["analyzed"]["metadata"] ||= {}
|
252
|
+
|
253
|
+
# this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
|
254
|
+
# we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
|
249
255
|
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
250
256
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
257
|
+
doc["id"] = doc["sha1"]
|
258
|
+
doc["_id"] = doc["sha1"]
|
251
259
|
yield doc, obj.key, content, metadata if block_given?
|
252
260
|
FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
261
|
+
doc["file"]["title"] ||= "Untitled Document: #{HumanHash::HumanHasher.new.humanize(doc["_id"])}"
|
253
262
|
doc
|
254
263
|
end
|
255
264
|
else
|
@@ -300,14 +309,20 @@ module Stevedore
|
|
300
309
|
if ArchiveSplitter::HANDLED_FORMATS.include?(filename.split(".")[-1])
|
301
310
|
ArchiveSplitter.split(filename).map do |constituent_file, constituent_basename, attachment_basenames, parent_basename|
|
302
311
|
doc, content, metadata = process_document(constituent_file, download_filename)
|
303
|
-
|
312
|
+
next nil if doc.nil?
|
304
313
|
doc["analyzed"] ||= {}
|
305
314
|
doc["analyzed"]["metadata"] ||= {}
|
315
|
+
|
316
|
+
# this is a hack: but we're replicating how IDs are calculated (in parsers/stevedore_blob.rb) to make "attachments" the list of IDs of all documents in the archive
|
317
|
+
# we have to set separate sha1s for these, because they're by default based only on the download URL (which is the same for all of the constituent files)
|
306
318
|
doc["analyzed"]["metadata"]["attachments"] = (parent_basename.nil? ? [] : [Digest::SHA1.hexdigest(download_filename + parent_basename)]) + attachment_basenames.map{|attachment| Digest::SHA1.hexdigest(download_filename + attachment) } # is a list of filenames
|
307
319
|
doc["sha1"] = Digest::SHA1.hexdigest(download_filename + File.basename(constituent_basename)) # since these files all share a download URL (that of the archive, we need to come up with a custom sha1)
|
308
320
|
doc["id"] = doc["sha1"]
|
321
|
+
doc["_id"] = doc["sha1"]
|
322
|
+
|
309
323
|
yield doc, filename, content, metadata if block_given?
|
310
324
|
# FileUtils.rm(constituent_file) rescue Errno::ENOENT # try to delete, but no biggie if it doesn't work for some weird reason.
|
325
|
+
puts doc.inspect
|
311
326
|
doc
|
312
327
|
end
|
313
328
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stevedore-uploader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.9
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -71,7 +71,7 @@ dependencies:
|
|
71
71
|
requirements:
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
|
-
version: 1.7
|
74
|
+
version: '1.7'
|
75
75
|
name: rika-stevedore
|
76
76
|
prerelease: false
|
77
77
|
type: :runtime
|
@@ -79,7 +79,7 @@ dependencies:
|
|
79
79
|
requirements:
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version: 1.7
|
82
|
+
version: '1.7'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
84
|
requirement: !ruby/object:Gem::Requirement
|
85
85
|
requirements:
|
@@ -136,6 +136,34 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '1.1'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - "~>"
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: 0.1.0
|
145
|
+
name: humanhash
|
146
|
+
prerelease: false
|
147
|
+
type: :runtime
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.1.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: 0.0.4
|
159
|
+
name: dkimverify
|
160
|
+
prerelease: false
|
161
|
+
type: :runtime
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - "~>"
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: 0.0.4
|
139
167
|
description: TK
|
140
168
|
email: jeremy.merrill@nytimes.com
|
141
169
|
executables: []
|
@@ -170,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
170
198
|
version: '0'
|
171
199
|
requirements: []
|
172
200
|
rubyforge_project:
|
173
|
-
rubygems_version: 2.
|
201
|
+
rubygems_version: 2.6.6
|
174
202
|
signing_key:
|
175
203
|
specification_version: 4
|
176
204
|
summary: Upload documents to a Stevedore search engine.
|