elasticrawl 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Elasticrawl::ParseJob do
|
4
|
+
describe '#set_segments' do
|
5
|
+
let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
|
6
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
|
7
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
|
8
|
+
let(:max_files) { 5 }
|
9
|
+
let(:parse_job) { Elasticrawl::ParseJob.new }
|
10
|
+
|
11
|
+
before do
|
12
|
+
crawl.create_segments
|
13
|
+
parse_job.set_segments(crawl.crawl_segments[0..1], max_files)
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'should have a job name based on current time' do
|
17
|
+
expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should have a job desc' do
|
21
|
+
expect(parse_job.job_desc).to eq job_desc
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should create 2 job steps' do
|
25
|
+
expect(parse_job.job_steps.count).to eq 2
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should set steps input path to segment uri' do
|
29
|
+
input_path = parse_job.job_steps[0].input_paths
|
30
|
+
segment_uri = crawl.crawl_segments[0].segment_s3_uri
|
31
|
+
|
32
|
+
expect(input_path.starts_with?(segment_uri)).to eq true
|
33
|
+
end
|
34
|
+
|
35
|
+
it 'should set output path' do
|
36
|
+
output_path = parse_job.job_steps[0].output_path
|
37
|
+
segment_name = crawl.crawl_segments[0].segment_name
|
38
|
+
|
39
|
+
expect(output_path.include?(parse_job.job_name)).to eq true
|
40
|
+
expect(output_path.include?(segment_name)).to eq true
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe '#confirm_message' do
|
45
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
|
46
|
+
let(:job) { Elasticrawl::ParseJob.new }
|
47
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
|
48
|
+
let(:cluster_desc) {
|
49
|
+
cluster_desc = <<-HERE
|
50
|
+
Cluster configuration
|
51
|
+
Master: 1 m1.medium (Spot: 0.12)
|
52
|
+
Core: 2 m1.medium (Spot: 0.12)
|
53
|
+
Task: --
|
54
|
+
HERE
|
55
|
+
}
|
56
|
+
|
57
|
+
before do
|
58
|
+
crawl.create_segments
|
59
|
+
job.set_segments(crawl.crawl_segments[0..2], 5)
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'should display message including job desc' do
|
63
|
+
expect(job.confirm_message.include?(job_desc)).to eq true
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'should display message including cluster desc' do
|
67
|
+
expect(job.confirm_message.include?(cluster_desc)).to eq true
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
describe '#run' do
|
72
|
+
let(:crawl_name) { 'CC-MAIN-2013-20' }
|
73
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
74
|
+
let(:job) { Elasticrawl::ParseJob.new }
|
75
|
+
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
76
|
+
|
77
|
+
before do
|
78
|
+
crawl.create_segments
|
79
|
+
job.set_segments(crawl.crawl_segments[0..1], 5)
|
80
|
+
|
81
|
+
Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
|
82
|
+
job.run
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'should set a job flow id' do
|
86
|
+
expect(job.job_flow_id).to eq job_flow_id
|
87
|
+
end
|
88
|
+
|
89
|
+
it 'should set parse time for parsed segments' do
|
90
|
+
expect(crawl.crawl_segments[0].parse_time.present?).to eq true
|
91
|
+
expect(crawl.crawl_segments[1].parse_time.present?).to eq true
|
92
|
+
expect(crawl.crawl_segments[2].parse_time.present?).to eq false
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
describe '#log_uri' do
|
97
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
|
98
|
+
let(:job) { Elasticrawl::ParseJob.new }
|
99
|
+
|
100
|
+
before do
|
101
|
+
crawl.create_segments
|
102
|
+
job.set_segments(crawl.crawl_segments)
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'should set a log uri including the job name' do
|
106
|
+
expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
describe '#history' do
|
111
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
|
112
|
+
let(:job) { Elasticrawl::ParseJob.new }
|
113
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
|
114
|
+
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
115
|
+
|
116
|
+
before do
|
117
|
+
crawl.create_segments
|
118
|
+
job.set_segments(crawl.crawl_segments)
|
119
|
+
|
120
|
+
Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
|
121
|
+
job.run
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'should return the job name, history and launch time' do
|
125
|
+
expect(job.history.include?(job.job_name)).to eq true
|
126
|
+
expect(job.history.include?(job.job_desc)).to eq true
|
127
|
+
expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
|
128
|
+
end
|
129
|
+
end
|
130
|
+
end
|
data/templates/aws.yml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
# Configures the AWS access credentials used when calling the AWS
|
2
|
+
# Elastic MapReduce and S3 APIs. This file is populated by the init command.
|
3
|
+
#
|
4
|
+
# Instead of configuring this file you can set the environment variables
|
5
|
+
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
|
6
|
+
access_key_id: 'ACCESS_KEY_ID'
|
7
|
+
secret_access_key: 'SECRET_ACCESS_KEY'
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Configures the Elastic MapReduce cluster that is launched to run parse and
|
2
|
+
# combine jobs. The list of EC2 instance types can be found at
|
3
|
+
# http://aws.amazon.com/ec2/instance-types/#instance-details
|
4
|
+
|
5
|
+
# Using spot instances is recommended to reduce costs. However if the spot
|
6
|
+
# price rises above your bid price the cluster may be terminated. Elasticrawl
|
7
|
+
# tries to reduce the effect of this by parsing each Common Crawl segment
|
8
|
+
# in a separate job flow step.
|
9
|
+
|
10
|
+
# The master node manages the cluster.
|
11
|
+
master_instance_group:
|
12
|
+
instance_type: m1.medium
|
13
|
+
use_spot_instances: true
|
14
|
+
bid_price: 0.120
|
15
|
+
|
16
|
+
# Core nodes run map and reduce tasks and store data using HDFS.
|
17
|
+
core_instance_group:
|
18
|
+
instance_type: m1.medium
|
19
|
+
instance_count: 2
|
20
|
+
use_spot_instances: true
|
21
|
+
bid_price: 0.120
|
22
|
+
|
23
|
+
# Task nodes are optional and only run map and reduce tasks.
|
24
|
+
task_instance_group:
|
25
|
+
instance_type: m1.small
|
26
|
+
instance_count: 0
|
27
|
+
use_spot_instances: true
|
28
|
+
bid_price: 0.080
|
29
|
+
|
30
|
+
# Array of bootstrap scripts that will be applied when the cluster nodes are
|
31
|
+
# initialized. The example installs the Ganglia distributed monitoring system.
|
32
|
+
bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
|
33
|
+
|
34
|
+
# Specifying an EC2 key pair allows SSH access to the master node. This also
|
35
|
+
# allows accessing the Hadoop Web UI over an SSH tunnel.
|
36
|
+
ec2_key_name: # 'key-pair-name'
|
37
|
+
|
38
|
+
# Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
|
39
|
+
# recommended since the Common Crawl corpus is stored there. Otherwise inter-
|
40
|
+
# region data transfer charges will apply.
|
41
|
+
placement: 'us-east-1a'
|
42
|
+
|
43
|
+
# The AMI version to use when launching instances.
|
44
|
+
emr_ami_version: 'latest'
|
data/templates/jobs.yml
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
|
2
|
+
# corpus.
|
3
|
+
|
4
|
+
# An S3 bucket is created by the init command and is used to store data and logs.
|
5
|
+
s3_bucket_name: 'BUCKET_NAME'
|
6
|
+
|
7
|
+
# A parse step is created per Common Crawl segment. A combine step takes the
|
8
|
+
# results from multiple segments to create a single set of output files.
|
9
|
+
|
10
|
+
# The parse input filter is used to specify the Common Crawl file type.
|
11
|
+
|
12
|
+
# WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
|
13
|
+
# WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
|
14
|
+
# WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
|
15
|
+
|
16
|
+
# The EMR config is an XML file that sets Hadoop properties. If a config file
|
17
|
+
# is specified then a bootstrap action is run on each node to apply it.
|
18
|
+
steps:
|
19
|
+
# Parse step for the Example Elasticrawl JAR. This does a word count
|
20
|
+
# against the text extractions of the corpus.
|
21
|
+
parse:
|
22
|
+
jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
|
23
|
+
class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
|
24
|
+
input_filter: 'wet/*.warc.wet.gz'
|
25
|
+
emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
|
26
|
+
# Combine step for the Example Elasticrawl JAR.
|
27
|
+
combine:
|
28
|
+
jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
|
29
|
+
class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
|
30
|
+
input_filter: 'part-*'
|
31
|
+
emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
|
metadata
ADDED
@@ -0,0 +1,315 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: elasticrawl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Ross Fairbanks
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2014-02-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: activerecord
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 4.0.2
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 4.0.2
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: activesupport
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: 4.0.2
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 4.0.2
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: aws-sdk
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '1.0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: elasticity
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ~>
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '2.7'
|
70
|
+
type: :runtime
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '2.7'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: highline
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ~>
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: 1.6.20
|
86
|
+
type: :runtime
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.6.20
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: sqlite3
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ~>
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: 1.3.8
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ~>
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 1.3.8
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: thor
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: 0.18.1
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ~>
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: 0.18.1
|
126
|
+
- !ruby/object:Gem::Dependency
|
127
|
+
name: rake
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
129
|
+
none: false
|
130
|
+
requirements:
|
131
|
+
- - ! '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
type: :development
|
135
|
+
prerelease: false
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
- !ruby/object:Gem::Dependency
|
143
|
+
name: bundler
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
145
|
+
none: false
|
146
|
+
requirements:
|
147
|
+
- - ~>
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '1.3'
|
150
|
+
type: :development
|
151
|
+
prerelease: false
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - ~>
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '1.3'
|
158
|
+
- !ruby/object:Gem::Dependency
|
159
|
+
name: rspec
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
161
|
+
none: false
|
162
|
+
requirements:
|
163
|
+
- - ~>
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: 2.14.1
|
166
|
+
type: :development
|
167
|
+
prerelease: false
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ~>
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: 2.14.1
|
174
|
+
- !ruby/object:Gem::Dependency
|
175
|
+
name: mocha
|
176
|
+
requirement: !ruby/object:Gem::Requirement
|
177
|
+
none: false
|
178
|
+
requirements:
|
179
|
+
- - ~>
|
180
|
+
- !ruby/object:Gem::Version
|
181
|
+
version: 1.0.0
|
182
|
+
type: :development
|
183
|
+
prerelease: false
|
184
|
+
version_requirements: !ruby/object:Gem::Requirement
|
185
|
+
none: false
|
186
|
+
requirements:
|
187
|
+
- - ~>
|
188
|
+
- !ruby/object:Gem::Version
|
189
|
+
version: 1.0.0
|
190
|
+
- !ruby/object:Gem::Dependency
|
191
|
+
name: database_cleaner
|
192
|
+
requirement: !ruby/object:Gem::Requirement
|
193
|
+
none: false
|
194
|
+
requirements:
|
195
|
+
- - ~>
|
196
|
+
- !ruby/object:Gem::Version
|
197
|
+
version: 1.2.0
|
198
|
+
type: :development
|
199
|
+
prerelease: false
|
200
|
+
version_requirements: !ruby/object:Gem::Requirement
|
201
|
+
none: false
|
202
|
+
requirements:
|
203
|
+
- - ~>
|
204
|
+
- !ruby/object:Gem::Version
|
205
|
+
version: 1.2.0
|
206
|
+
- !ruby/object:Gem::Dependency
|
207
|
+
name: shoulda-matchers
|
208
|
+
requirement: !ruby/object:Gem::Requirement
|
209
|
+
none: false
|
210
|
+
requirements:
|
211
|
+
- - ~>
|
212
|
+
- !ruby/object:Gem::Version
|
213
|
+
version: 2.4.0
|
214
|
+
type: :development
|
215
|
+
prerelease: false
|
216
|
+
version_requirements: !ruby/object:Gem::Requirement
|
217
|
+
none: false
|
218
|
+
requirements:
|
219
|
+
- - ~>
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
version: 2.4.0
|
222
|
+
description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
|
223
|
+
Common Crawl data.
|
224
|
+
email:
|
225
|
+
- ross@rossfairbanks.com
|
226
|
+
executables:
|
227
|
+
- elasticrawl
|
228
|
+
extensions: []
|
229
|
+
extra_rdoc_files: []
|
230
|
+
files:
|
231
|
+
- .gitignore
|
232
|
+
- .travis.yml
|
233
|
+
- Cheffile
|
234
|
+
- Cheffile.lock
|
235
|
+
- Gemfile
|
236
|
+
- LICENSE
|
237
|
+
- README.md
|
238
|
+
- Rakefile
|
239
|
+
- Vagrantfile
|
240
|
+
- bin/elasticrawl
|
241
|
+
- db/migrate/201401051536_create_crawls.rb
|
242
|
+
- db/migrate/201401051855_create_crawl_segments.rb
|
243
|
+
- db/migrate/201401101723_create_jobs.rb
|
244
|
+
- db/migrate/201401141606_create_job_steps.rb
|
245
|
+
- elasticrawl.gemspec
|
246
|
+
- lib/elasticrawl.rb
|
247
|
+
- lib/elasticrawl/cluster.rb
|
248
|
+
- lib/elasticrawl/combine_job.rb
|
249
|
+
- lib/elasticrawl/config.rb
|
250
|
+
- lib/elasticrawl/crawl.rb
|
251
|
+
- lib/elasticrawl/crawl_segment.rb
|
252
|
+
- lib/elasticrawl/error.rb
|
253
|
+
- lib/elasticrawl/job.rb
|
254
|
+
- lib/elasticrawl/job_step.rb
|
255
|
+
- lib/elasticrawl/parse_job.rb
|
256
|
+
- lib/elasticrawl/version.rb
|
257
|
+
- spec/fixtures/aws.yml
|
258
|
+
- spec/fixtures/cluster.yml
|
259
|
+
- spec/fixtures/jobs.yml
|
260
|
+
- spec/spec_helper.rb
|
261
|
+
- spec/unit/cluster_spec.rb
|
262
|
+
- spec/unit/combine_job_spec.rb
|
263
|
+
- spec/unit/config_spec.rb
|
264
|
+
- spec/unit/crawl_segment_spec.rb
|
265
|
+
- spec/unit/crawl_spec.rb
|
266
|
+
- spec/unit/job_spec.rb
|
267
|
+
- spec/unit/job_step_spec.rb
|
268
|
+
- spec/unit/parse_job_spec.rb
|
269
|
+
- templates/aws.yml
|
270
|
+
- templates/cluster.yml
|
271
|
+
- templates/jobs.yml
|
272
|
+
homepage: https://github.com/rossf7/elasticrawl
|
273
|
+
licenses:
|
274
|
+
- MIT
|
275
|
+
post_install_message:
|
276
|
+
rdoc_options: []
|
277
|
+
require_paths:
|
278
|
+
- lib
|
279
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
280
|
+
none: false
|
281
|
+
requirements:
|
282
|
+
- - ! '>='
|
283
|
+
- !ruby/object:Gem::Version
|
284
|
+
version: '0'
|
285
|
+
segments:
|
286
|
+
- 0
|
287
|
+
hash: -3344138865650739079
|
288
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
289
|
+
none: false
|
290
|
+
requirements:
|
291
|
+
- - ! '>='
|
292
|
+
- !ruby/object:Gem::Version
|
293
|
+
version: '0'
|
294
|
+
segments:
|
295
|
+
- 0
|
296
|
+
hash: -3344138865650739079
|
297
|
+
requirements: []
|
298
|
+
rubyforge_project:
|
299
|
+
rubygems_version: 1.8.23
|
300
|
+
signing_key:
|
301
|
+
specification_version: 3
|
302
|
+
summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
|
303
|
+
test_files:
|
304
|
+
- spec/fixtures/aws.yml
|
305
|
+
- spec/fixtures/cluster.yml
|
306
|
+
- spec/fixtures/jobs.yml
|
307
|
+
- spec/spec_helper.rb
|
308
|
+
- spec/unit/cluster_spec.rb
|
309
|
+
- spec/unit/combine_job_spec.rb
|
310
|
+
- spec/unit/config_spec.rb
|
311
|
+
- spec/unit/crawl_segment_spec.rb
|
312
|
+
- spec/unit/crawl_spec.rb
|
313
|
+
- spec/unit/job_spec.rb
|
314
|
+
- spec/unit/job_step_spec.rb
|
315
|
+
- spec/unit/parse_job_spec.rb
|