elasticrawl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,130 @@
1
+ require 'spec_helper'
2
+
3
+ describe Elasticrawl::ParseJob do
4
+ describe '#set_segments' do
5
+ let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
6
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
7
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
8
+ let(:max_files) { 5 }
9
+ let(:parse_job) { Elasticrawl::ParseJob.new }
10
+
11
+ before do
12
+ crawl.create_segments
13
+ parse_job.set_segments(crawl.crawl_segments[0..1], max_files)
14
+ end
15
+
16
+ it 'should have a job name based on current time' do
17
+ expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
18
+ end
19
+
20
+ it 'should have a job desc' do
21
+ expect(parse_job.job_desc).to eq job_desc
22
+ end
23
+
24
+ it 'should create 2 job steps' do
25
+ expect(parse_job.job_steps.count).to eq 2
26
+ end
27
+
28
+ it 'should set steps input path to segment uri' do
29
+ input_path = parse_job.job_steps[0].input_paths
30
+ segment_uri = crawl.crawl_segments[0].segment_s3_uri
31
+
32
+ expect(input_path.starts_with?(segment_uri)).to eq true
33
+ end
34
+
35
+ it 'should set output path' do
36
+ output_path = parse_job.job_steps[0].output_path
37
+ segment_name = crawl.crawl_segments[0].segment_name
38
+
39
+ expect(output_path.include?(parse_job.job_name)).to eq true
40
+ expect(output_path.include?(segment_name)).to eq true
41
+ end
42
+ end
43
+
44
+ describe '#confirm_message' do
45
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
46
+ let(:job) { Elasticrawl::ParseJob.new }
47
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
48
+ let(:cluster_desc) {
49
+ cluster_desc = <<-HERE
50
+ Cluster configuration
51
+ Master: 1 m1.medium (Spot: 0.12)
52
+ Core: 2 m1.medium (Spot: 0.12)
53
+ Task: --
54
+ HERE
55
+ }
56
+
57
+ before do
58
+ crawl.create_segments
59
+ job.set_segments(crawl.crawl_segments[0..2], 5)
60
+ end
61
+
62
+ it 'should display message including job desc' do
63
+ expect(job.confirm_message.include?(job_desc)).to eq true
64
+ end
65
+
66
+ it 'should display message including cluster desc' do
67
+ expect(job.confirm_message.include?(cluster_desc)).to eq true
68
+ end
69
+ end
70
+
71
+ describe '#run' do
72
+ let(:crawl_name) { 'CC-MAIN-2013-20' }
73
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
74
+ let(:job) { Elasticrawl::ParseJob.new }
75
+ let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
76
+
77
+ before do
78
+ crawl.create_segments
79
+ job.set_segments(crawl.crawl_segments[0..1], 5)
80
+
81
+ Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
82
+ job.run
83
+ end
84
+
85
+ it 'should set a job flow id' do
86
+ expect(job.job_flow_id).to eq job_flow_id
87
+ end
88
+
89
+ it 'should set parse time for parsed segments' do
90
+ expect(crawl.crawl_segments[0].parse_time.present?).to eq true
91
+ expect(crawl.crawl_segments[1].parse_time.present?).to eq true
92
+ expect(crawl.crawl_segments[2].parse_time.present?).to eq false
93
+ end
94
+ end
95
+
96
+ describe '#log_uri' do
97
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
98
+ let(:job) { Elasticrawl::ParseJob.new }
99
+
100
+ before do
101
+ crawl.create_segments
102
+ job.set_segments(crawl.crawl_segments)
103
+ end
104
+
105
+ it 'should set a log uri including the job name' do
106
+ expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
107
+ end
108
+ end
109
+
110
+ describe '#history' do
111
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
112
+ let(:job) { Elasticrawl::ParseJob.new }
113
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
114
+ let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
115
+
116
+ before do
117
+ crawl.create_segments
118
+ job.set_segments(crawl.crawl_segments)
119
+
120
+ Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
121
+ job.run
122
+ end
123
+
124
+ it 'should return the job name, history and launch time' do
125
+ expect(job.history.include?(job.job_name)).to eq true
126
+ expect(job.history.include?(job.job_desc)).to eq true
127
+ expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
128
+ end
129
+ end
130
+ end
data/templates/aws.yml ADDED
@@ -0,0 +1,7 @@
1
+ # Configures the AWS access credentials used when calling the AWS
2
+ # Elastic MapReduce and S3 APIs. This file is populated by the init command.
3
+ #
4
+ # Instead of configuring this file, you can set the environment variables
5
+ # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
6
+ access_key_id: 'ACCESS_KEY_ID'
7
+ secret_access_key: 'SECRET_ACCESS_KEY'
@@ -0,0 +1,44 @@
1
+ # Configures the Elastic MapReduce cluster that is launched to run parse and
2
+ # combine jobs. The list of EC2 instance types can be found at
3
+ # http://aws.amazon.com/ec2/instance-types/#instance-details
4
+
5
+ # Using spot instances is recommended to reduce costs. However, if the spot
6
+ # price rises above your bid price the cluster may be terminated. Elasticrawl
7
+ # tries to reduce the effect of this by parsing each Common Crawl segment
8
+ # in a separate job flow step.
9
+
10
+ # The master node manages the cluster.
11
+ master_instance_group:
12
+ instance_type: m1.medium
13
+ use_spot_instances: true
14
+ bid_price: 0.120
15
+
16
+ # Core nodes run map and reduce tasks and store data using HDFS.
17
+ core_instance_group:
18
+ instance_type: m1.medium
19
+ instance_count: 2
20
+ use_spot_instances: true
21
+ bid_price: 0.120
22
+
23
+ # Task nodes are optional and only run map and reduce tasks.
24
+ task_instance_group:
25
+ instance_type: m1.small
26
+ instance_count: 0
27
+ use_spot_instances: true
28
+ bid_price: 0.080
29
+
30
+ # Array of bootstrap scripts that will be applied when the cluster nodes are
31
+ # initialized. The example installs the Ganglia distributed monitoring system.
32
+ bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
33
+
34
+ # Specifying an EC2 key pair allows SSH access to the master node. This also
35
+ # allows accessing the Hadoop Web UI over an SSH tunnel.
36
+ ec2_key_name: # 'key-pair-name'
37
+
38
+ # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
39
+ # recommended since the Common Crawl corpus is stored there. Otherwise,
40
+ # inter-region data transfer charges will apply.
41
+ placement: 'us-east-1a'
42
+
43
+ # The AMI version to use when launching instances.
44
+ emr_ami_version: 'latest'
@@ -0,0 +1,31 @@
1
+ # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
2
+ # corpus.
3
+
4
+ # An S3 bucket is created by the init command and is used to store data and logs.
5
+ s3_bucket_name: 'BUCKET_NAME'
6
+
7
+ # A parse step is created per Common Crawl segment. A combine step takes the
8
+ # results from multiple segments to create a single set of output files.
9
+
10
+ # The parse input filter is used to specify the Common Crawl file type.
11
+
12
+ # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
13
+ # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
14
+ # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
15
+
16
+ # The EMR config is an XML file that sets Hadoop properties. If a config file
17
+ # is specified then a bootstrap action is run on each node to apply it.
18
+ steps:
19
+ # Parse step for the Example Elasticrawl JAR. This does a word count
20
+ # against the text extractions of the corpus.
21
+ parse:
22
+ jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
23
+ class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
24
+ input_filter: 'wet/*.warc.wet.gz'
25
+ emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
26
+ # Combine step for the Example Elasticrawl JAR.
27
+ combine:
28
+ jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
29
+ class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
30
+ input_filter: 'part-*'
31
+ emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
metadata ADDED
@@ -0,0 +1,315 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elasticrawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ross Fairbanks
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activerecord
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 4.0.2
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 4.0.2
30
+ - !ruby/object:Gem::Dependency
31
+ name: activesupport
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 4.0.2
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 4.0.2
46
+ - !ruby/object:Gem::Dependency
47
+ name: aws-sdk
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: elasticity
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '2.7'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '2.7'
78
+ - !ruby/object:Gem::Dependency
79
+ name: highline
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: 1.6.20
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.6.20
94
+ - !ruby/object:Gem::Dependency
95
+ name: sqlite3
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.3.8
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.3.8
110
+ - !ruby/object:Gem::Dependency
111
+ name: thor
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 0.18.1
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ~>
124
+ - !ruby/object:Gem::Version
125
+ version: 0.18.1
126
+ - !ruby/object:Gem::Dependency
127
+ name: rake
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :development
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: bundler
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ~>
148
+ - !ruby/object:Gem::Version
149
+ version: '1.3'
150
+ type: :development
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ~>
156
+ - !ruby/object:Gem::Version
157
+ version: '1.3'
158
+ - !ruby/object:Gem::Dependency
159
+ name: rspec
160
+ requirement: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ~>
164
+ - !ruby/object:Gem::Version
165
+ version: 2.14.1
166
+ type: :development
167
+ prerelease: false
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ~>
172
+ - !ruby/object:Gem::Version
173
+ version: 2.14.1
174
+ - !ruby/object:Gem::Dependency
175
+ name: mocha
176
+ requirement: !ruby/object:Gem::Requirement
177
+ none: false
178
+ requirements:
179
+ - - ~>
180
+ - !ruby/object:Gem::Version
181
+ version: 1.0.0
182
+ type: :development
183
+ prerelease: false
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ~>
188
+ - !ruby/object:Gem::Version
189
+ version: 1.0.0
190
+ - !ruby/object:Gem::Dependency
191
+ name: database_cleaner
192
+ requirement: !ruby/object:Gem::Requirement
193
+ none: false
194
+ requirements:
195
+ - - ~>
196
+ - !ruby/object:Gem::Version
197
+ version: 1.2.0
198
+ type: :development
199
+ prerelease: false
200
+ version_requirements: !ruby/object:Gem::Requirement
201
+ none: false
202
+ requirements:
203
+ - - ~>
204
+ - !ruby/object:Gem::Version
205
+ version: 1.2.0
206
+ - !ruby/object:Gem::Dependency
207
+ name: shoulda-matchers
208
+ requirement: !ruby/object:Gem::Requirement
209
+ none: false
210
+ requirements:
211
+ - - ~>
212
+ - !ruby/object:Gem::Version
213
+ version: 2.4.0
214
+ type: :development
215
+ prerelease: false
216
+ version_requirements: !ruby/object:Gem::Requirement
217
+ none: false
218
+ requirements:
219
+ - - ~>
220
+ - !ruby/object:Gem::Version
221
+ version: 2.4.0
222
+ description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
223
+ Common Crawl data.
224
+ email:
225
+ - ross@rossfairbanks.com
226
+ executables:
227
+ - elasticrawl
228
+ extensions: []
229
+ extra_rdoc_files: []
230
+ files:
231
+ - .gitignore
232
+ - .travis.yml
233
+ - Cheffile
234
+ - Cheffile.lock
235
+ - Gemfile
236
+ - LICENSE
237
+ - README.md
238
+ - Rakefile
239
+ - Vagrantfile
240
+ - bin/elasticrawl
241
+ - db/migrate/201401051536_create_crawls.rb
242
+ - db/migrate/201401051855_create_crawl_segments.rb
243
+ - db/migrate/201401101723_create_jobs.rb
244
+ - db/migrate/201401141606_create_job_steps.rb
245
+ - elasticrawl.gemspec
246
+ - lib/elasticrawl.rb
247
+ - lib/elasticrawl/cluster.rb
248
+ - lib/elasticrawl/combine_job.rb
249
+ - lib/elasticrawl/config.rb
250
+ - lib/elasticrawl/crawl.rb
251
+ - lib/elasticrawl/crawl_segment.rb
252
+ - lib/elasticrawl/error.rb
253
+ - lib/elasticrawl/job.rb
254
+ - lib/elasticrawl/job_step.rb
255
+ - lib/elasticrawl/parse_job.rb
256
+ - lib/elasticrawl/version.rb
257
+ - spec/fixtures/aws.yml
258
+ - spec/fixtures/cluster.yml
259
+ - spec/fixtures/jobs.yml
260
+ - spec/spec_helper.rb
261
+ - spec/unit/cluster_spec.rb
262
+ - spec/unit/combine_job_spec.rb
263
+ - spec/unit/config_spec.rb
264
+ - spec/unit/crawl_segment_spec.rb
265
+ - spec/unit/crawl_spec.rb
266
+ - spec/unit/job_spec.rb
267
+ - spec/unit/job_step_spec.rb
268
+ - spec/unit/parse_job_spec.rb
269
+ - templates/aws.yml
270
+ - templates/cluster.yml
271
+ - templates/jobs.yml
272
+ homepage: https://github.com/rossf7/elasticrawl
273
+ licenses:
274
+ - MIT
275
+ post_install_message:
276
+ rdoc_options: []
277
+ require_paths:
278
+ - lib
279
+ required_ruby_version: !ruby/object:Gem::Requirement
280
+ none: false
281
+ requirements:
282
+ - - ! '>='
283
+ - !ruby/object:Gem::Version
284
+ version: '0'
285
+ segments:
286
+ - 0
287
+ hash: -3344138865650739079
288
+ required_rubygems_version: !ruby/object:Gem::Requirement
289
+ none: false
290
+ requirements:
291
+ - - ! '>='
292
+ - !ruby/object:Gem::Version
293
+ version: '0'
294
+ segments:
295
+ - 0
296
+ hash: -3344138865650739079
297
+ requirements: []
298
+ rubyforge_project:
299
+ rubygems_version: 1.8.23
300
+ signing_key:
301
+ specification_version: 3
302
+ summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
303
+ test_files:
304
+ - spec/fixtures/aws.yml
305
+ - spec/fixtures/cluster.yml
306
+ - spec/fixtures/jobs.yml
307
+ - spec/spec_helper.rb
308
+ - spec/unit/cluster_spec.rb
309
+ - spec/unit/combine_job_spec.rb
310
+ - spec/unit/config_spec.rb
311
+ - spec/unit/crawl_segment_spec.rb
312
+ - spec/unit/crawl_spec.rb
313
+ - spec/unit/job_spec.rb
314
+ - spec/unit/job_step_spec.rb
315
+ - spec/unit/parse_job_spec.rb