stevedore-uploader 1.0.14-java → 1.0.15-java

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 594d704c741c5af52ab9953ab334483b51b6d0ed
4
- data.tar.gz: d7fda45ea1116f25fdb0b98537f9d7f2ffab2ab9
3
+ metadata.gz: 592883a568d09ec2fa6a6155ac06409f96b01386
4
+ data.tar.gz: 6aaa836860bcdd8cbc432d92ade4bb61616f9a67
5
5
  SHA512:
6
- metadata.gz: ec87c6dd3fac2105b428a20810c7ddfb8f24010ce28369dab755032b0705ea787e86103a7ad887cd5bdd014614105c95f1f5dae431d11b0624a0b96eabb53dbe
7
- data.tar.gz: 2f642f39598fbc5be89fc814387212defbf71794e81c125327fa053b8f487ad3444faa16e447ea943c3d1f2d141d302aae2f8f0e11e0683b42934beb429f4f57
6
+ metadata.gz: 6993d8f2bf553ec22560f50e95bd5f1415aace8f11c43dd335ae80cf5b955c7fb50afe7b66f98055375f1a49e8442add0a9f65b8e2013ef43bcb5c319aeded74
7
+ data.tar.gz: 0dde17b5fe9791d24d5342448911f704ad93fedcaf715d040ebb0a323ea84d369f7f91389cc7be6472a3740583936d5b6ad0b3554b4a97c997589b8406c22fa7
data/bin/stevedore CHANGED
@@ -6,69 +6,67 @@ raise Exception, "You've gotta use Java 1.8; you're on #{java.lang.System.getPro
6
6
 
7
7
  require "#{File.expand_path(File.dirname(__FILE__))}/../lib/stevedore-uploader.rb"
8
8
 
9
- if __FILE__ == $0
10
- require 'optparse'
11
- require 'ostruct'
12
- options = OpenStruct.new
13
- options.ocr = true
14
-
15
- op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
16
- opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
17
- "The location of the ElasticSearch server") do |host|
18
- options.host = host
19
- end
20
-
21
- opts.on("-iNAME", "--index=NAME",
22
- "A name to use for the ES index (defaults to using the directory name)") do |index|
23
- options.index = index
24
- end
25
-
26
- opts.on("-sPATH", "--s3path=PATH",
27
- "The path under your bucket where these files have been uploaded. (defaults to ES index)"
28
- ) do |s3path|
29
- options.s3path = s3path
30
- end
31
- opts.on("-bPATH", "--s3bucket=PATH",
32
- "The s3 bucket where these files have already been be uploaded (or will be later)."
33
- ) do |s3bucket|
34
- options.s3bucket = s3bucket
35
- end
36
-
37
- opts.on("--title-column=COLNAME",
38
- "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
39
- ) do |title_column|
40
- options.title_column = title_column
41
- end
42
- opts.on("--text-column=COLNAME",
43
- "If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
44
- ) do |text_column|
45
- options.text_column = text_column
46
- end
47
- opts.on("--slice-size=SLICE",
48
- "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
49
- ) do |slice_size|
50
- options.slice_size = slice_size.to_i
51
- end
52
-
53
- opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
54
- options.ocr = v
55
- end
56
-
57
- opts.on( '-?', '--help', 'Display this screen' ) do
58
- puts opts
59
- exit
60
- end
9
+ require 'optparse'
10
+ require 'ostruct'
11
+ options = OpenStruct.new
12
+ options.ocr = true
13
+
14
+ op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
15
+ opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
16
+ "The location of the ElasticSearch server") do |host|
17
+ options.host = host
61
18
  end
62
19
 
63
- op.parse!
20
+ opts.on("-iNAME", "--index=NAME",
21
+ "A name to use for the ES index (defaults to using the directory name)") do |index|
22
+ options.index = index
23
+ end
64
24
 
65
- # to delete an index: curl -X DELETE localhost:9200/indexname/
66
- unless ARGV.length == 1
67
- puts op
25
+ opts.on("-sPATH", "--s3path=PATH",
26
+ "The path under your bucket where these files have been uploaded. (defaults to ES index)"
27
+ ) do |s3path|
28
+ options.s3path = s3path
29
+ end
30
+ opts.on("-bPATH", "--s3bucket=PATH",
31
+ "The s3 bucket where these files have already been be uploaded (or will be later)."
32
+ ) do |s3bucket|
33
+ options.s3bucket = s3bucket
34
+ end
35
+
36
+ opts.on("--title-column=COLNAME",
37
+ "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
38
+ ) do |title_column|
39
+ options.title_column = title_column
40
+ end
41
+ opts.on("--text-column=COLNAME",
42
+ "If target file is a CSV, which column contains the main, searchable of the row. Integer index or string column name."
43
+ ) do |text_column|
44
+ options.text_column = text_column
45
+ end
46
+ opts.on("--slice-size=SLICE",
47
+ "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
48
+ ) do |slice_size|
49
+ options.slice_size = slice_size.to_i
50
+ end
51
+
52
+ opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
53
+ options.ocr = v
54
+ end
55
+
56
+ opts.on( '-?', '--help', 'Display this screen' ) do
57
+ puts opts
68
58
  exit
69
59
  end
70
60
  end
71
61
 
62
+ op.parse!
63
+
64
+ # to delete an index: curl -X DELETE localhost:9200/indexname/
65
+ unless ARGV.length == 1
66
+ puts op
67
+ exit
68
+ end
69
+
72
70
  # you can provide either a path to files locally or
73
71
  # an S3 endpoint as s3://int-data-dumps/YOURINDEXNAME
74
72
  FOLDER = ARGV.shift
@@ -93,28 +91,22 @@ S3_BASEPATH = "https://#{S3_BUCKET}.s3.amazonaws.com/#{S3_PATH}"
93
91
  raise ArgumentError, "specify a destination" unless FOLDER
94
92
  raise ArgumentError, "specify the elasticsearch host" unless ES_HOST
95
93
 
96
- ###############################
97
- # actual stuff
98
- ###############################
99
-
100
- if __FILE__ == $0
101
- f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
102
- f.should_ocr = options.ocr
103
- puts "Will not OCR, per --no-ocr option" unless f.should_ocr
104
- f.slice_size = options.slice_size if options.slice_size
105
- puts "Slice size set to #{f.slice_size}" if options.slice_size
106
-
107
- if FOLDER.match(/\.[ct]sv$/)
108
- f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
109
- else
110
- f.do!(FOLDER)
111
- end
112
- puts "Finished uploading documents at #{Time.now}"
94
+ f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
95
+ f.should_ocr = options.ocr
96
+ puts "Will not OCR, per --no-ocr option" unless f.should_ocr
97
+ f.slice_size = options.slice_size if options.slice_size
98
+ puts "Slice size set to #{f.slice_size}" if options.slice_size
113
99
 
114
- puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
115
- if f.errors.size > 0
116
- STDERR.puts "#{f.errors.size} failed documents:"
117
- STDERR.puts f.errors.inspect
118
- puts "Uploading successful, but with #{f.errors.size} errors."
119
- end
100
+ if FOLDER.match(/\.[ct]sv$/)
101
+ f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
102
+ else
103
+ f.do!(FOLDER)
104
+ end
105
+ puts "Finished uploading documents at #{Time.now}"
106
+
107
+ puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
108
+ if f.errors.size > 0
109
+ STDERR.puts "#{f.errors.size} failed documents:"
110
+ STDERR.puts f.errors.inspect
111
+ puts "Uploading successful, but with #{f.errors.size} errors."
120
112
  end
@@ -1,6 +1,6 @@
1
1
 
2
2
  require 'rika'
3
- require 'jruby-openssl'
3
+ require 'openssl'
4
4
  require 'net/https'
5
5
  require 'elasticsearch'
6
6
  require 'elasticsearch/transport/transport/http/manticore'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stevedore-uploader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.14
4
+ version: 1.0.15
5
5
  platform: java
6
6
  authors:
7
7
  - Jeremy B. Merrill