stevedore-uploader 1.0.14-java → 1.0.15-java

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 594d704c741c5af52ab9953ab334483b51b6d0ed
- data.tar.gz: d7fda45ea1116f25fdb0b98537f9d7f2ffab2ab9
+ metadata.gz: 592883a568d09ec2fa6a6155ac06409f96b01386
+ data.tar.gz: 6aaa836860bcdd8cbc432d92ade4bb61616f9a67
  SHA512:
- metadata.gz: ec87c6dd3fac2105b428a20810c7ddfb8f24010ce28369dab755032b0705ea787e86103a7ad887cd5bdd014614105c95f1f5dae431d11b0624a0b96eabb53dbe
- data.tar.gz: 2f642f39598fbc5be89fc814387212defbf71794e81c125327fa053b8f487ad3444faa16e447ea943c3d1f2d141d302aae2f8f0e11e0683b42934beb429f4f57
+ metadata.gz: 6993d8f2bf553ec22560f50e95bd5f1415aace8f11c43dd335ae80cf5b955c7fb50afe7b66f98055375f1a49e8442add0a9f65b8e2013ef43bcb5c319aeded74
+ data.tar.gz: 0dde17b5fe9791d24d5342448911f704ad93fedcaf715d040ebb0a323ea84d369f7f91389cc7be6472a3740583936d5b6ad0b3554b4a97c997589b8406c22fa7
data/bin/stevedore CHANGED
@@ -6,69 +6,67 @@ raise Exception, "You've gotta use Java 1.8; you're on #{java.lang.System.getPro

  require "#{File.expand_path(File.dirname(__FILE__))}/../lib/stevedore-uploader.rb"

- if __FILE__ == $0
- require 'optparse'
- require 'ostruct'
- options = OpenStruct.new
- options.ocr = true
-
- op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
- opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
- "The location of the ElasticSearch server") do |host|
- options.host = host
- end
-
- opts.on("-iNAME", "--index=NAME",
- "A name to use for the ES index (defaults to using the directory name)") do |index|
- options.index = index
- end
-
- opts.on("-sPATH", "--s3path=PATH",
- "The path under your bucket where these files have been uploaded. (defaults to ES index)"
- ) do |s3path|
- options.s3path = s3path
- end
- opts.on("-bPATH", "--s3bucket=PATH",
- "The s3 bucket where these files have already been uploaded (or will be later)."
- ) do |s3bucket|
- options.s3bucket = s3bucket
- end
-
- opts.on("--title-column=COLNAME",
- "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
- ) do |title_column|
- options.title_column = title_column
- end
- opts.on("--text-column=COLNAME",
- "If target file is a CSV, which column contains the main, searchable text of the row. Integer index or string column name."
- ) do |text_column|
- options.text_column = text_column
- end
- opts.on("--slice-size=SLICE",
- "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
- ) do |slice_size|
- options.slice_size = slice_size.to_i
- end
-
- opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
- options.ocr = v
- end
-
- opts.on( '-?', '--help', 'Display this screen' ) do
- puts opts
- exit
- end
+ require 'optparse'
+ require 'ostruct'
+ options = OpenStruct.new
+ options.ocr = true
+
+ op = OptionParser.new("Usage: stevedore [options] target_(dir_or_csv)") do |opts|
+ opts.on("-hSERVER:PORT", "--host=SERVER:PORT",
+ "The location of the ElasticSearch server") do |host|
+ options.host = host
  end

- op.parse!
+ opts.on("-iNAME", "--index=NAME",
+ "A name to use for the ES index (defaults to using the directory name)") do |index|
+ options.index = index
+ end

- # to delete an index: curl -X DELETE localhost:9200/indexname/
- unless ARGV.length == 1
- puts op
+ opts.on("-sPATH", "--s3path=PATH",
+ "The path under your bucket where these files have been uploaded. (defaults to ES index)"
+ ) do |s3path|
+ options.s3path = s3path
+ end
+ opts.on("-bPATH", "--s3bucket=PATH",
+ "The s3 bucket where these files have already been uploaded (or will be later)."
+ ) do |s3bucket|
+ options.s3bucket = s3bucket
+ end
+
+ opts.on("--title-column=COLNAME",
+ "If target file is a CSV, which column contains the title of the row. Integer index or string column name."
+ ) do |title_column|
+ options.title_column = title_column
+ end
+ opts.on("--text-column=COLNAME",
+ "If target file is a CSV, which column contains the main, searchable text of the row. Integer index or string column name."
+ ) do |text_column|
+ options.text_column = text_column
+ end
+ opts.on("--slice-size=SLICE",
+ "Process documents in batches of SLICE. Default is 100. Lower this if you get timeouts. Raise it to go faster."
+ ) do |slice_size|
+ options.slice_size = slice_size.to_i
+ end
+
+ opts.on("-o", "--[no-]ocr", "don't attempt to OCR any PDFs, even if they contain no text") do |v|
+ options.ocr = v
+ end
+
+ opts.on( '-?', '--help', 'Display this screen' ) do
+ puts opts
  exit
  end
  end

+ op.parse!
+
+ # to delete an index: curl -X DELETE localhost:9200/indexname/
+ unless ARGV.length == 1
+ puts op
+ exit
+ end
+
  # you can provide either a path to files locally or
  # an S3 endpoint as s3://int-data-dumps/YOURINDEXNAME
  FOLDER = ARGV.shift
@@ -93,28 +91,22 @@ S3_BASEPATH = "https://#{S3_BUCKET}.s3.amazonaws.com/#{S3_PATH}"
  raise ArgumentError, "specify a destination" unless FOLDER
  raise ArgumentError, "specify the elasticsearch host" unless ES_HOST

- ###############################
- # actual stuff
- ###############################
-
- if __FILE__ == $0
- f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
- f.should_ocr = options.ocr
- puts "Will not OCR, per --no-ocr option" unless f.should_ocr
- f.slice_size = options.slice_size if options.slice_size
- puts "Slice size set to #{f.slice_size}" if options.slice_size
-
- if FOLDER.match(/\.[ct]sv$/)
- f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
- else
- f.do!(FOLDER)
- end
- puts "Finished uploading documents at #{Time.now}"
+ f = Stevedore::ESUploader.new(ES_HOST, ES_INDEX, S3_BUCKET, S3_BASEPATH)
+ f.should_ocr = options.ocr
+ puts "Will not OCR, per --no-ocr option" unless f.should_ocr
+ f.slice_size = options.slice_size if options.slice_size
+ puts "Slice size set to #{f.slice_size}" if options.slice_size

- puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
- if f.errors.size > 0
- STDERR.puts "#{f.errors.size} failed documents:"
- STDERR.puts f.errors.inspect
- puts "Uploading successful, but with #{f.errors.size} errors."
- end
+ if FOLDER.match(/\.[ct]sv$/)
+ f.do_csv!(FOLDER, File.join(f.s3_basepath, File.basename(FOLDER)), options.title_column, options.text_column)
+ else
+ f.do!(FOLDER)
+ end
+ puts "Finished uploading documents at #{Time.now}"
+
+ puts "Created Stevedore for #{ES_INDEX}; go check out https://stevedore.newsdev.net/search/#{ES_INDEX} or http://stevedore.adm.prd.newsdev.nytimes.com/search/#{ES_INDEX}"
+ if f.errors.size > 0
+ STDERR.puts "#{f.errors.size} failed documents:"
+ STDERR.puts f.errors.inspect
+ puts "Uploading successful, but with #{f.errors.size} errors."
  end
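
The bin/stevedore script above is a thin command-line wrapper around Stevedore::ESUploader, so the same upload can be scripted against the library directly. A minimal sketch, using only the methods visible in this diff and hypothetical host, index, bucket, and path values:

  require 'stevedore-uploader'

  # Hypothetical values; the CLI derives these from --host, --index, --s3bucket
  # and the target directory/CSV argument.
  f = Stevedore::ESUploader.new("localhost:9200", "my-index", "my-bucket",
                                "https://my-bucket.s3.amazonaws.com/my-index")
  f.should_ocr = false          # same effect as passing --no-ocr
  f.slice_size = 100            # same effect as --slice-size=100
  f.do!("/path/to/documents")   # or f.do_csv!(...) for a CSV target
  STDERR.puts f.errors.inspect if f.errors.size > 0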
lib/stevedore-uploader.rb CHANGED
@@ -1,6 +1,6 @@

  require 'rika'
- require 'jruby-openssl'
+ require 'openssl'
  require 'net/https'
  require 'elasticsearch'
  require 'elasticsearch/transport/transport/http/manticore'
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: stevedore-uploader
  version: !ruby/object:Gem::Version
- version: 1.0.14
+ version: 1.0.15
  platform: java
  authors:
  - Jeremy B. Merrill