elasticrawl 1.0.0
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp

.vagrant
cookbooks
spec/fixtures/elasticrawl.sqlite3
data/.travis.yml
ADDED
data/Cheffile
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
#^syntax detection

site "http://community.opscode.com/api/v1"

cookbook "apt",
  :version => "1.7.0"
cookbook "build-essential"
cookbook "git"
cookbook "rbenv",
  :git => "https://github.com/fnichol/chef-rbenv.git",
  :ref => "v0.7.2"
cookbook "ruby_build"
cookbook "vim"
data/Cheffile.lock
ADDED
@@ -0,0 +1,37 @@
SITE
  remote: http://community.opscode.com/api/v1
  specs:
    apt (2.2.1)
    build-essential (1.4.2)
    chef_handler (1.1.4)
    dmg (2.0.4)
    git (2.7.0)
      build-essential (>= 0.0.0)
      dmg (>= 0.0.0)
      runit (>= 1.0.0)
      windows (>= 0.0.0)
      yum (>= 0.0.0)
    ruby_build (0.8.0)
    runit (1.3.0)
      build-essential (>= 0.0.0)
      yum (>= 0.0.0)
    vim (1.0.2)
    windows (1.11.0)
      chef_handler (>= 0.0.0)
    yum (2.3.4)

GIT
  remote: https://github.com/fnichol/chef-rbenv.git
  ref: v0.7.2
  sha: f2b53292e810dd2b43f6121f9958f5f29979dcb1
  specs:
    rbenv (0.7.2)

DEPENDENCIES
  apt (>= 0)
  build-essential (>= 0)
  git (>= 0)
  rbenv (>= 0)
  ruby_build (>= 0)
  vim (>= 0)
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2014 Ross Fairbanks

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,232 @@
# Elasticrawl

Launch AWS Elastic MapReduce jobs that process Common Crawl data.
Elasticrawl works with the latest Common Crawl data structure and file formats
([2013 data onwards](http://commoncrawl.org/new-crawl-data-available/)).
Ships with a default configuration that launches the
[elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples) jobs.
This is an implementation of the standard Hadoop Word Count example.

## Overview

Common Crawl have released 2 web crawls of 2013 data. Further crawls will be released
during 2014. Each crawl is split into multiple segments that contain 3 file types.

* WARC - WARC files with the HTTP request and response for each fetch
* WAT - WARC encoded files containing JSON metadata
* WET - WARC encoded text extractions of the HTTP responses

| Crawl Name      | Date     | Segments | Pages         | Size (uncompressed) |
| --------------- |:--------:|:--------:|:-------------:|:-------------------:|
| CC-MAIN-2013-48 | Nov 2013 | 517      | ~ 2.3 billion | 148 TB              |
| CC-MAIN-2013-20 | May 2013 | 316      | ~ 2.0 billion | 102 TB              |

Elasticrawl is a command line tool that automates launching Elastic MapReduce
jobs against this data.

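For orientation, the data for each crawl lives in Common Crawl's public S3
bucket, with one prefix per segment and a sub-prefix per file type. The
listing below is only an illustrative sketch; the bucket and prefix names are
assumptions based on the 2013 layout (the segment ID is taken from the
examples later in this README), so check the Common Crawl site for the
authoritative paths.

```bash
# Illustrative sketch of the assumed 2013-onwards layout (requires the AWS CLI).
aws s3 ls s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163036037/
#   warc/  full HTTP requests and responses
#   wat/   JSON metadata
#   wet/   extracted text (what the example parse job reads)
```
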
[![Code Climate](https://codeclimate.com/github/rossf7/elasticrawl.png)](https://codeclimate.com/github/rossf7/elasticrawl)
[![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) Ruby 1.9.3, 2.0.0, 2.1.0

## Installation

### Dependencies

Elasticrawl is developed in Ruby and requires Ruby 1.9.3 or later.
Installing using [rbenv](https://github.com/sstephenson/rbenv#installation)
and the ruby-build plugin is recommended.

### Install elasticrawl

```bash
~$ gem install elasticrawl --no-rdoc --no-ri
```

If you're using rbenv, you need to do a rehash to add the elasticrawl executable
to your path.

```bash
~$ rbenv rehash
```

## Quick Start

In this example you'll launch 2 EMR jobs against a small portion of the Nov
2013 crawl. Each job will take around 20 minutes to run. Most of this is setup
time while your EC2 spot instances are provisioned and your Hadoop cluster is
configured.

You'll need an [AWS account](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html)
to use elasticrawl. The total cost of the 2 EMR jobs will be under $1 USD.

### Setup

You'll need to choose an S3 bucket name and enter your AWS access key and
secret key. The S3 bucket will be used for storing data and logs. S3 bucket
names must be unique; using hyphens rather than underscores is recommended.

```bash
~$ elasticrawl init your-s3-bucket

Enter AWS Access Key ID: ************
Enter AWS Secret Access Key: ************

...

Bucket s3://elasticrawl-test created
Config dir /Users/ross/.elasticrawl created
Config complete
```

### Parse Job

For this example you'll parse the first 2 WET files in the first 2 segments
of the Nov 2013 crawl.

```bash
~$ elasticrawl parse CC-MAIN-2013-48 --max-segments 2 --max-files 2

Job configuration
Crawl: CC-MAIN-2013-48 Segments: 2 Parsing: 2 files per segment

Cluster configuration
Master: 1 m1.medium (Spot: 0.12)
Core: 2 m1.medium (Spot: 0.12)
Task: --
Launch job? (y/n)

y
Job Name: 1391458746774 Job Flow ID: j-2X9JVDC1UKEQ1
```

You can monitor the progress of your job in the Elastic MapReduce section
of the AWS web console.

### Combine Job

The combine job will aggregate the word count results from both segments into
a single set of files.

```bash
~$ elasticrawl combine --input-jobs 1391458746774

Job configuration
Combining: 2 segments

Cluster configuration
Master: 1 m1.medium (Spot: 0.12)
Core: 2 m1.medium (Spot: 0.12)
Task: --
Launch job? (y/n)

y
Job Name: 1391459918730 Job Flow ID: j-GTJ2M7D1TXO6
```

Once the combine job is complete you can download your results from the
S3 section of the AWS web console. Your data will be stored in

[your S3 bucket]/data/2-combine/[job name]

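If you prefer the command line to the web console, the results can also be
fetched with any S3 client. This is a sketch that assumes the AWS CLI is
installed; the bucket and job names are placeholders.

```bash
# Download the combined output locally (placeholder bucket and job name).
aws s3 sync s3://your-s3-bucket/data/2-combine/1391459918730 ./combine-results
```
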
### Cleaning Up

You'll be charged by AWS for any data stored in your S3 bucket. The destroy
command deletes your S3 bucket and the ~/.elasticrawl/ directory.

```bash
~$ elasticrawl destroy

WARNING:
Bucket s3://elasticrawl-test and its data will be deleted
Config dir /home/vagrant/.elasticrawl will be deleted
Delete? (y/n)
y

Bucket s3://elasticrawl-test deleted
Config dir /home/vagrant/.elasticrawl deleted
Config deleted
```

## Configuring Elasticrawl

The elasticrawl init command creates the ~/.elasticrawl/ directory (shown
below), which contains

* [aws.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/aws.yml) -
stores your AWS access credentials. Or you can set the environment
variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY

* [cluster.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/cluster.yml) -
configures the EC2 instances that are launched to form your EMR cluster

* [jobs.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/jobs.yml) -
stores your S3 bucket name and the config for the parse and combine jobs

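A quick way to check the setup is to list the directory. This is a sketch of
the expected contents based on the files named above.

```bash
~$ ls ~/.elasticrawl
aws.yml  cluster.yml  jobs.yml
# Job state is also kept in a local sqlite3 database (see the sqlite3
# dependency in the gemspec); its exact filename is not documented in
# this README, so it is omitted here.
```
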
## Managing Segments

Each Common Crawl segment is parsed as a separate EMR job step. This avoids
overloading the job tracker and means that if a job fails, only data from the
current segment is lost. However, an EMR job flow can only contain 256 steps,
so to process an entire crawl multiple parse jobs must be combined.

```bash
~$ elasticrawl combine --input-jobs 1391430796774 1391458746774 1391498046704
```

You can use the status command to see details of crawls and jobs.

```bash
~$ elasticrawl status

Crawl Status
CC-MAIN-2013-48 Segments: to parse 517, parsed 2, total 519

Job History (last 10)
1391459918730 Launched: 2014-02-04 13:58:12 Combining: 2 segments
1391458746774 Launched: 2014-02-04 13:55:50 Crawl: CC-MAIN-2013-48 Segments: 2 Parsing: 2 files per segment
```

You can use the reset command to parse a crawl again.

```bash
~$ elasticrawl reset CC-MAIN-2013-48

Reset crawl? (y/n)
y
CC-MAIN-2013-48 Segments: to parse 519, parsed 0, total 519
```

To parse the same segments multiple times, pass in a list of segment names.

```bash
~$ elasticrawl parse CC-MAIN-2013-48 --segment-list 1386163036037 1386163035819 --max-files 2
```

## Running your own Jobs

1. Fork the [elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples)
2. Make your changes
3. Compile your changes into a JAR using Maven
4. Upload your JAR to your own S3 bucket (see the sketch below)
5. Edit ~/.elasticrawl/jobs.yml with your JAR and class names

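A minimal sketch of steps 3 and 4, assuming a standard Maven project and an
installed AWS CLI; the JAR and bucket names are placeholders rather than
values taken from this gem.

```bash
# Build the job JAR (artifact name is a placeholder).
mvn clean package

# Upload it to your own bucket so EMR can fetch it.
aws s3 cp target/elasticrawl-examples-1.0.jar \
  s3://your-s3-bucket/jars/elasticrawl-examples-1.0.jar

# Then edit ~/.elasticrawl/jobs.yml to point at the uploaded JAR and your classes.
```
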
## TODO

* Add support for Streaming and Pig jobs

## Thanks

* Thanks to everyone at Common Crawl for making this awesome dataset available.
* Thanks to Robert Slifka for the [elasticity](https://github.com/rslifka/elasticity)
gem which provides a nice Ruby wrapper for the EMR REST API.

## Contributing

1. Fork it
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create new Pull Request

## License

This code is licensed under the MIT license.
data/Rakefile
ADDED
data/Vagrantfile
ADDED
@@ -0,0 +1,58 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :

Vagrant.configure("2") do |config|
  # All Vagrant configuration is done here. The most common configuration
  # options are documented and commented below. For a complete reference,
  # please see the online documentation at vagrantup.com.

  # Fix DNS issues with Ubuntu 12.04 by always using host's resolver
  config.vm.provider "virtualbox" do |vbox|
    vbox.customize ["modifyvm", :id, "--natdnshostresolver1", "on"]
  end

  # Elasticrawl launches Hadoop jobs for the CommonCrawl dataset using the AWS EMR service.
  config.vm.define :elasticrawl do |elasticrawl|
    elasticrawl.vm.box = "elasticrawl"

    # Ubuntu Server 12.04 LTS
    elasticrawl.vm.box_url = "http://files.vagrantup.com/precise64.box"

    # Network config
    elasticrawl.vm.network :public_network

    # Provision using Chef Solo
    elasticrawl.vm.provision "chef_solo" do |chef|
      chef.cookbooks_path = "cookbooks"
      chef.add_recipe "apt"
      chef.add_recipe "build-essential"
      chef.add_recipe "ruby_build"
      chef.add_recipe "rbenv::user"
      chef.add_recipe "git"
      chef.add_recipe "vim"

      chef.json = {
        "rbenv" => {
          "user_installs" => [
            {
              "user" => "vagrant",
              "rubies" => ["1.9.3-p484", "2.0.0-p353", "2.1.0"],
              "global" => "1.9.3-p484",
              "gems" => {
                "1.9.3-p484" => [
                  { "name" => "bundler" }
                ],
                "2.0.0-p353" => [
                  { "name" => "bundler" }
                ],
                "2.1.0" => [
                  { "name" => "bundler" }
                ]
              }
            }
          ]
        }
      }
    end
  end
end
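For development, the Vagrantfile above provisions with Chef Solo from a local
./cookbooks directory, which is git-ignored and populated from the Cheffile.
A typical workflow, assuming Vagrant and librarian-chef are installed, might
look like this:

```bash
# Vendor the cookbooks pinned in Cheffile/Cheffile.lock into ./cookbooks.
gem install librarian-chef
librarian-chef install

# Boot and provision the :elasticrawl machine defined above.
vagrant up elasticrawl
```
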
data/bin/elasticrawl
ADDED
@@ -0,0 +1,141 @@
#!/usr/bin/env ruby
require 'elasticrawl'

module Elasticrawl
  class Cli < Thor
    desc 'init S3_BUCKET_NAME', 'Creates S3 bucket and config directory'
    method_option :access_key_id, :type => :string, :desc => 'AWS Access Key ID'
    method_option :secret_access_key, :type => :string, :desc => 'AWS Secret Access Key'
    def init(s3_bucket_name)
      key = options[:access_key_id]
      secret = options[:secret_access_key]

      if key.nil? || secret.nil?
        config = Config.new

        # Prompt for credentials showing the current values.
        key = ask(config.access_key_prompt)
        secret = ask(config.secret_key_prompt)

        # Use current values if user has selected them.
        key = config.access_key_id if key.blank?
        secret = config.secret_access_key if secret.blank?
      end

      # Create new config object with updated credentials.
      config = Config.new(key, secret)

      if config.bucket_exists?(s3_bucket_name)
        puts('ERROR: S3 bucket already exists')
      else
        if config.dir_exists?
          puts("WARNING: Config dir #{config.config_dir} already exists")
          overwrite = agree('Overwrite? (y/n)', true)
        end

        puts(config.create(s3_bucket_name)) if !config.dir_exists? || overwrite == true
      end
    end

    desc 'parse CRAWL_NAME', 'Launches parse job against Common Crawl corpus'
    method_option :max_segments, :type => :numeric, :desc => 'number of crawl segments to parse'
    method_option :max_files, :type => :numeric, :desc => 'number of files to parse per segment'
    method_option :segment_list, :type => :array, :desc => 'list of segment names to parse'
    def parse(crawl_name)
      load_database

      crawl = find_crawl(crawl_name)
      if crawl.has_segments?
        segment_list = options[:segment_list]

        if segment_list.present?
          segments = crawl.select_segments(segment_list)
        else
          segments = crawl.next_segments(options[:max_segments])
        end

        if segments.count == 0
          puts('ERROR: No segments matched for parsing')
        else
          job = ParseJob.new
          job.set_segments(segments, options[:max_files])
          puts(job.confirm_message)

          launch = agree('Launch job? (y/n)', true)
          puts(job.run) if launch == true
        end
      else
        puts('ERROR: Crawl does not exist')
      end
    end

    desc 'combine', 'Launches combine job against parse job results'
    method_option :input_jobs, :type => :array, :required => true,
      :desc => 'list of input jobs to combine'
    def combine
      load_database

      job = CombineJob.new
      job.set_input_jobs(options[:input_jobs])
      puts(job.confirm_message)

      launch = agree('Launch job? (y/n)', true)
      puts(job.run) if launch == true
    end

    desc 'status', 'Shows crawl status and lists jobs'
    method_option :show_all, :type => :boolean, :desc => 'list all jobs'
    def status
      load_database
      puts(Crawl.status(options[:show_all]))
    end

    desc 'reset CRAWL_NAME', 'Resets a crawl so its segments are parsed again'
    def reset(crawl_name)
      load_database

      crawl = find_crawl(crawl_name)
      if crawl.has_segments?
        reset = agree('Reset crawl? (y/n)', true)
        puts(crawl.reset) if reset == true
      else
        puts('ERROR: Crawl does not exist')
      end
    end

    desc 'destroy', 'Deletes S3 bucket and config directory'
    def destroy
      config = Config.new

      if config.dir_exists?
        puts(config.delete_warning)
        delete = agree('Delete? (y/n)', true)
        puts(config.delete) if delete == true
      else
        puts('No config dir. Nothing to do')
      end
    end

    private
    # Find a crawl record in the database.
    def find_crawl(crawl_name)
      Crawl.where(:crawl_name => crawl_name).first_or_initialize
    end

    # Load sqlite database.
    def load_database
      config = Config.new
      config.load_database
    end
  end
end

begin
  Elasticrawl::Cli.start(ARGV)
# Show errors parsing command line arguments.
rescue Thor::Error => e
  puts(e.message)
# Show elasticrawl errors.
rescue Elasticrawl::Error => e
  puts("ERROR: #{e.message}")
end
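As the Thor method_options above show, init can also take the AWS credentials
as command-line flags instead of interactive prompts. A short usage sketch
(the key values are placeholders):

```bash
# Non-interactive setup; option names map to the Thor options defined above.
elasticrawl init your-s3-bucket \
  --access-key-id AKIAXXXXXXXXXXXXXXXX \
  --secret-access-key xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```
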
data/db/migrate/201401051855_create_crawl_segments.rb
ADDED
@@ -0,0 +1,14 @@
class CreateCrawlSegments < ActiveRecord::Migration
  def change
    create_table :crawl_segments do |t|
      t.references :crawl
      t.string :segment_name
      t.string :segment_s3_uri
      t.datetime :parse_time
      t.timestamps
    end

    add_index(:crawl_segments, :segment_name, :unique => true)
    add_index(:crawl_segments, :segment_s3_uri, :unique => true)
  end
end
data/db/migrate/201401101723_create_jobs.rb
ADDED
@@ -0,0 +1,14 @@
class CreateJobs < ActiveRecord::Migration
  def change
    create_table :jobs do |t|
      t.string :type
      t.string :job_name
      t.string :job_desc
      t.integer :max_files
      t.string :job_flow_id
      t.timestamps
    end

    add_index(:jobs, :job_name, :unique => true)
  end
end
data/elasticrawl.gemspec
ADDED
@@ -0,0 +1,35 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'elasticrawl/version'

Gem::Specification.new do |spec|
  spec.name          = 'elasticrawl'
  spec.version       = Elasticrawl::VERSION
  spec.authors       = ['Ross Fairbanks']
  spec.email         = ['ross@rossfairbanks.com']
  spec.summary       = %q{Launch AWS Elastic MapReduce jobs that process Common Crawl data.}
  spec.description   = %q{Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process Common Crawl data.}
  spec.homepage      = 'https://github.com/rossf7/elasticrawl'
  spec.license       = 'MIT'

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_dependency 'activerecord', '~> 4.0.2'
  spec.add_dependency 'activesupport', '~> 4.0.2'
  spec.add_dependency 'aws-sdk', '~> 1.0'
  spec.add_dependency 'elasticity', '~> 2.7'
  spec.add_dependency 'highline', '~> 1.6.20'
  spec.add_dependency 'sqlite3', '~> 1.3.8'
  spec.add_dependency 'thor', '~> 0.18.1'

  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14.1'
  spec.add_development_dependency 'mocha', '~> 1.0.0'
  spec.add_development_dependency 'database_cleaner', '~> 1.2.0'
  spec.add_development_dependency 'shoulda-matchers', '~> 2.4.0'
end