elasticrawl 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,130 @@
1
+ require 'spec_helper'
2
+
3
+ describe Elasticrawl::ParseJob do
4
+ describe '#set_segments' do
5
+ let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
6
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
7
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
8
+ let(:max_files) { 5 }
9
+ let(:parse_job) { Elasticrawl::ParseJob.new }
10
+
11
+ before do
12
+ crawl.create_segments
13
+ parse_job.set_segments(crawl.crawl_segments[0..1], max_files)
14
+ end
15
+
16
+ it 'should have a job name based on current time' do
17
+ expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
18
+ end
19
+
20
+ it 'should have a job desc' do
21
+ expect(parse_job.job_desc).to eq job_desc
22
+ end
23
+
24
+ it 'should create 2 job steps' do
25
+ expect(parse_job.job_steps.count).to eq 2
26
+ end
27
+
28
+ it 'should set steps input path to segment uri' do
29
+ input_path = parse_job.job_steps[0].input_paths
30
+ segment_uri = crawl.crawl_segments[0].segment_s3_uri
31
+
32
+ expect(input_path.starts_with?(segment_uri)).to eq true
33
+ end
34
+
35
+ it 'should set output path' do
36
+ output_path = parse_job.job_steps[0].output_path
37
+ segment_name = crawl.crawl_segments[0].segment_name
38
+
39
+ expect(output_path.include?(parse_job.job_name)).to eq true
40
+ expect(output_path.include?(segment_name)).to eq true
41
+ end
42
+ end
43
+
44
+ describe '#confirm_message' do
45
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
46
+ let(:job) { Elasticrawl::ParseJob.new }
47
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
48
+ let(:cluster_desc) {
49
+ cluster_desc = <<-HERE
50
+ Cluster configuration
51
+ Master: 1 m1.medium (Spot: 0.12)
52
+ Core: 2 m1.medium (Spot: 0.12)
53
+ Task: --
54
+ HERE
55
+ }
56
+
57
+ before do
58
+ crawl.create_segments
59
+ job.set_segments(crawl.crawl_segments[0..2], 5)
60
+ end
61
+
62
+ it 'should display message including job desc' do
63
+ expect(job.confirm_message.include?(job_desc)).to eq true
64
+ end
65
+
66
+ it 'should display message including cluster desc' do
67
+ expect(job.confirm_message.include?(cluster_desc)).to eq true
68
+ end
69
+ end
70
+
71
+ describe '#run' do
72
+ let(:crawl_name) { 'CC-MAIN-2013-20' }
73
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
74
+ let(:job) { Elasticrawl::ParseJob.new }
75
+ let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
76
+
77
+ before do
78
+ crawl.create_segments
79
+ job.set_segments(crawl.crawl_segments[0..1], 5)
80
+
81
+ Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
82
+ job.run
83
+ end
84
+
85
+ it 'should set a job flow id' do
86
+ expect(job.job_flow_id).to eq job_flow_id
87
+ end
88
+
89
+ it 'should set parse time for parsed segments' do
90
+ expect(crawl.crawl_segments[0].parse_time.present?).to eq true
91
+ expect(crawl.crawl_segments[1].parse_time.present?).to eq true
92
+ expect(crawl.crawl_segments[2].parse_time.present?).to eq false
93
+ end
94
+ end
95
+
96
+ describe '#log_uri' do
97
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
98
+ let(:job) { Elasticrawl::ParseJob.new }
99
+
100
+ before do
101
+ crawl.create_segments
102
+ job.set_segments(crawl.crawl_segments)
103
+ end
104
+
105
+ it 'should set a log uri including the job name' do
106
+ expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
107
+ end
108
+ end
109
+
110
+ describe '#history' do
111
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
112
+ let(:job) { Elasticrawl::ParseJob.new }
113
+ let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
114
+ let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
115
+
116
+ before do
117
+ crawl.create_segments
118
+ job.set_segments(crawl.crawl_segments)
119
+
120
+ Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
121
+ job.run
122
+ end
123
+
124
+ it 'should return the job name, history and launch time' do
125
+ expect(job.history.include?(job.job_name)).to eq true
126
+ expect(job.history.include?(job.job_desc)).to eq true
127
+ expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
128
+ end
129
+ end
130
+ end
data/templates/aws.yml ADDED
@@ -0,0 +1,7 @@
1
+ # Configures the AWS access credentials used when calling the AWS
2
+ # Elastic MapReduce and S3 APIs. This file is populated by the init command.
3
+ #
4
+ # Instead of configuring this file you can set the environment variables
5
+ # AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
6
+ access_key_id: 'ACCESS_KEY_ID'
7
+ secret_access_key: 'SECRET_ACCESS_KEY'
@@ -0,0 +1,44 @@
1
+ # Configures the Elastic MapReduce cluster that is launched to run parse and
2
+ # combine jobs. The list of EC2 instance types can be found at
3
+ # http://aws.amazon.com/ec2/instance-types/#instance-details
4
+
5
+ # Using spot instances is recommended to reduce costs. However, if the spot
6
+ # price rises above your bid price the cluster may be terminated. Elasticrawl
7
+ # tries to reduce the effect of this by parsing each Common Crawl segment
8
+ # in a separate job flow step.
9
+
10
+ # The master node manages the cluster.
11
+ master_instance_group:
12
+ instance_type: m1.medium
13
+ use_spot_instances: true
14
+ bid_price: 0.120
15
+
16
+ # Core nodes run map and reduce tasks and store data using HDFS.
17
+ core_instance_group:
18
+ instance_type: m1.medium
19
+ instance_count: 2
20
+ use_spot_instances: true
21
+ bid_price: 0.120
22
+
23
+ # Task nodes are optional and only run map and reduce tasks.
24
+ task_instance_group:
25
+ instance_type: m1.small
26
+ instance_count: 0
27
+ use_spot_instances: true
28
+ bid_price: 0.080
29
+
30
+ # Array of bootstrap scripts that will be applied when the cluster nodes are
31
+ # initialized. The example installs the Ganglia distributed monitoring system.
32
+ bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
33
+
34
+ # Specifying an EC2 key pair allows SSH access to the master node. This also
35
+ # allows accessing the Hadoop Web UI over an SSH tunnel.
36
+ ec2_key_name: # 'key-pair-name'
37
+
38
+ # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
39
+ # recommended since the Common Crawl corpus is stored there. Otherwise inter
40
+ # region data transfer charges will apply.
41
+ placement: 'us-east-1a'
42
+
43
+ # The AMI version to use when launching instances.
44
+ emr_ami_version: 'latest'
@@ -0,0 +1,31 @@
1
+ # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
2
+ # corpus.
3
+
4
+ # An S3 bucket is created by the init command and is used to store data and logs.
5
+ s3_bucket_name: 'BUCKET_NAME'
6
+
7
+ # A parse step is created per Common Crawl segment. A combine step takes the
8
+ # results from multiple segments to create a single set of output files.
9
+
10
+ # The parse input filter is used to specify the Common Crawl file type.
11
+
12
+ # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
13
+ # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
14
+ # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
15
+
16
+ # The EMR config is an XML file that sets Hadoop properties. If a config file
17
+ # is specified then a bootstrap action is run on each node to apply it.
18
+ steps:
19
+ # Parse step for the Example Elasticrawl JAR. This does a word count
20
+ # against the text extractions of the corpus.
21
+ parse:
22
+ jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
23
+ class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
24
+ input_filter: 'wet/*.warc.wet.gz'
25
+ emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
26
+ # Combine step for the Example Elasticrawl JAR.
27
+ combine:
28
+ jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
29
+ class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
30
+ input_filter: 'part-*'
31
+ emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
metadata ADDED
@@ -0,0 +1,315 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: elasticrawl
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ross Fairbanks
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2014-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activerecord
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 4.0.2
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 4.0.2
30
+ - !ruby/object:Gem::Dependency
31
+ name: activesupport
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 4.0.2
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 4.0.2
46
+ - !ruby/object:Gem::Dependency
47
+ name: aws-sdk
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: elasticity
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ~>
68
+ - !ruby/object:Gem::Version
69
+ version: '2.7'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: '2.7'
78
+ - !ruby/object:Gem::Dependency
79
+ name: highline
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ~>
84
+ - !ruby/object:Gem::Version
85
+ version: 1.6.20
86
+ type: :runtime
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.6.20
94
+ - !ruby/object:Gem::Dependency
95
+ name: sqlite3
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ~>
100
+ - !ruby/object:Gem::Version
101
+ version: 1.3.8
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ~>
108
+ - !ruby/object:Gem::Version
109
+ version: 1.3.8
110
+ - !ruby/object:Gem::Dependency
111
+ name: thor
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: 0.18.1
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ~>
124
+ - !ruby/object:Gem::Version
125
+ version: 0.18.1
126
+ - !ruby/object:Gem::Dependency
127
+ name: rake
128
+ requirement: !ruby/object:Gem::Requirement
129
+ none: false
130
+ requirements:
131
+ - - ! '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ type: :development
135
+ prerelease: false
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
142
+ - !ruby/object:Gem::Dependency
143
+ name: bundler
144
+ requirement: !ruby/object:Gem::Requirement
145
+ none: false
146
+ requirements:
147
+ - - ~>
148
+ - !ruby/object:Gem::Version
149
+ version: '1.3'
150
+ type: :development
151
+ prerelease: false
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ~>
156
+ - !ruby/object:Gem::Version
157
+ version: '1.3'
158
+ - !ruby/object:Gem::Dependency
159
+ name: rspec
160
+ requirement: !ruby/object:Gem::Requirement
161
+ none: false
162
+ requirements:
163
+ - - ~>
164
+ - !ruby/object:Gem::Version
165
+ version: 2.14.1
166
+ type: :development
167
+ prerelease: false
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ~>
172
+ - !ruby/object:Gem::Version
173
+ version: 2.14.1
174
+ - !ruby/object:Gem::Dependency
175
+ name: mocha
176
+ requirement: !ruby/object:Gem::Requirement
177
+ none: false
178
+ requirements:
179
+ - - ~>
180
+ - !ruby/object:Gem::Version
181
+ version: 1.0.0
182
+ type: :development
183
+ prerelease: false
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ~>
188
+ - !ruby/object:Gem::Version
189
+ version: 1.0.0
190
+ - !ruby/object:Gem::Dependency
191
+ name: database_cleaner
192
+ requirement: !ruby/object:Gem::Requirement
193
+ none: false
194
+ requirements:
195
+ - - ~>
196
+ - !ruby/object:Gem::Version
197
+ version: 1.2.0
198
+ type: :development
199
+ prerelease: false
200
+ version_requirements: !ruby/object:Gem::Requirement
201
+ none: false
202
+ requirements:
203
+ - - ~>
204
+ - !ruby/object:Gem::Version
205
+ version: 1.2.0
206
+ - !ruby/object:Gem::Dependency
207
+ name: shoulda-matchers
208
+ requirement: !ruby/object:Gem::Requirement
209
+ none: false
210
+ requirements:
211
+ - - ~>
212
+ - !ruby/object:Gem::Version
213
+ version: 2.4.0
214
+ type: :development
215
+ prerelease: false
216
+ version_requirements: !ruby/object:Gem::Requirement
217
+ none: false
218
+ requirements:
219
+ - - ~>
220
+ - !ruby/object:Gem::Version
221
+ version: 2.4.0
222
+ description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
223
+ Common Crawl data.
224
+ email:
225
+ - ross@rossfairbanks.com
226
+ executables:
227
+ - elasticrawl
228
+ extensions: []
229
+ extra_rdoc_files: []
230
+ files:
231
+ - .gitignore
232
+ - .travis.yml
233
+ - Cheffile
234
+ - Cheffile.lock
235
+ - Gemfile
236
+ - LICENSE
237
+ - README.md
238
+ - Rakefile
239
+ - Vagrantfile
240
+ - bin/elasticrawl
241
+ - db/migrate/201401051536_create_crawls.rb
242
+ - db/migrate/201401051855_create_crawl_segments.rb
243
+ - db/migrate/201401101723_create_jobs.rb
244
+ - db/migrate/201401141606_create_job_steps.rb
245
+ - elasticrawl.gemspec
246
+ - lib/elasticrawl.rb
247
+ - lib/elasticrawl/cluster.rb
248
+ - lib/elasticrawl/combine_job.rb
249
+ - lib/elasticrawl/config.rb
250
+ - lib/elasticrawl/crawl.rb
251
+ - lib/elasticrawl/crawl_segment.rb
252
+ - lib/elasticrawl/error.rb
253
+ - lib/elasticrawl/job.rb
254
+ - lib/elasticrawl/job_step.rb
255
+ - lib/elasticrawl/parse_job.rb
256
+ - lib/elasticrawl/version.rb
257
+ - spec/fixtures/aws.yml
258
+ - spec/fixtures/cluster.yml
259
+ - spec/fixtures/jobs.yml
260
+ - spec/spec_helper.rb
261
+ - spec/unit/cluster_spec.rb
262
+ - spec/unit/combine_job_spec.rb
263
+ - spec/unit/config_spec.rb
264
+ - spec/unit/crawl_segment_spec.rb
265
+ - spec/unit/crawl_spec.rb
266
+ - spec/unit/job_spec.rb
267
+ - spec/unit/job_step_spec.rb
268
+ - spec/unit/parse_job_spec.rb
269
+ - templates/aws.yml
270
+ - templates/cluster.yml
271
+ - templates/jobs.yml
272
+ homepage: https://github.com/rossf7/elasticrawl
273
+ licenses:
274
+ - MIT
275
+ post_install_message:
276
+ rdoc_options: []
277
+ require_paths:
278
+ - lib
279
+ required_ruby_version: !ruby/object:Gem::Requirement
280
+ none: false
281
+ requirements:
282
+ - - ! '>='
283
+ - !ruby/object:Gem::Version
284
+ version: '0'
285
+ segments:
286
+ - 0
287
+ hash: -3344138865650739079
288
+ required_rubygems_version: !ruby/object:Gem::Requirement
289
+ none: false
290
+ requirements:
291
+ - - ! '>='
292
+ - !ruby/object:Gem::Version
293
+ version: '0'
294
+ segments:
295
+ - 0
296
+ hash: -3344138865650739079
297
+ requirements: []
298
+ rubyforge_project:
299
+ rubygems_version: 1.8.23
300
+ signing_key:
301
+ specification_version: 3
302
+ summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
303
+ test_files:
304
+ - spec/fixtures/aws.yml
305
+ - spec/fixtures/cluster.yml
306
+ - spec/fixtures/jobs.yml
307
+ - spec/spec_helper.rb
308
+ - spec/unit/cluster_spec.rb
309
+ - spec/unit/combine_job_spec.rb
310
+ - spec/unit/config_spec.rb
311
+ - spec/unit/crawl_segment_spec.rb
312
+ - spec/unit/crawl_spec.rb
313
+ - spec/unit/job_spec.rb
314
+ - spec/unit/job_step_spec.rb
315
+ - spec/unit/parse_job_spec.rb