elasticrawl 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +4 -3
- data/Cheffile +7 -10
- data/Cheffile.lock +20 -27
- data/README.md +29 -29
- data/Vagrantfile +20 -13
- data/elasticrawl.gemspec +12 -12
- data/lib/elasticrawl/cluster.rb +7 -2
- data/lib/elasticrawl/version.rb +1 -1
- data/spec/spec_helper.rb +3 -0
- data/spec/unit/crawl_segment_spec.rb +1 -1
- data/spec/unit/crawl_spec.rb +1 -1
- data/spec/unit/job_spec.rb +1 -1
- data/spec/unit/job_step_spec.rb +1 -1
- metadata +55 -55
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a7a988f505c699d078fa4cc15981b77f7924e907
|
|
4
|
+
data.tar.gz: db7ee7ae05ccec51b0ffde16900656c859523001
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 12949f3230c3a0f7d08d4e02a9b296968616fa634072db882fe26809c58791d06ebf48d21fbc3c703ab1f858fdf12cfa9c324a6a97a7e896a5916679c7d6de06
|
|
7
|
+
data.tar.gz: ab9ec066ecb469707241751087ff6d25ff89e9013fdb0a8505bca904fcaa3fed2b331ef289fa0ffb71dd970261ec7c0028ca955d94642fad7093191723d8c4e3
|
data/.travis.yml
CHANGED
data/Cheffile
CHANGED
|
@@ -1,14 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env ruby
|
|
2
2
|
#^syntax detection
|
|
3
3
|
|
|
4
|
-
site "
|
|
4
|
+
site "https://supermarket.getchef.com/api/v1"
|
|
5
5
|
|
|
6
|
-
cookbook "apt",
|
|
7
|
-
|
|
8
|
-
cookbook "
|
|
9
|
-
cookbook "
|
|
10
|
-
cookbook "
|
|
11
|
-
|
|
12
|
-
:ref => "v0.7.2"
|
|
13
|
-
cookbook "ruby_build"
|
|
14
|
-
cookbook "vim"
|
|
6
|
+
cookbook "apt", "2.9.2"
|
|
7
|
+
cookbook "build-essential", "2.2.4"
|
|
8
|
+
cookbook "git", "4.3.5"
|
|
9
|
+
cookbook "ruby_rbenv", "1.0.1"
|
|
10
|
+
cookbook "ruby_build", "0.8.0"
|
|
11
|
+
cookbook "vim", "2.0.0"
|
data/Cheffile.lock
CHANGED
|
@@ -1,37 +1,30 @@
|
|
|
1
1
|
SITE
|
|
2
|
-
remote:
|
|
2
|
+
remote: https://supermarket.getchef.com/api/v1
|
|
3
3
|
specs:
|
|
4
|
-
apt (2.2
|
|
5
|
-
build-essential (
|
|
6
|
-
chef_handler (1.
|
|
7
|
-
dmg (2.0
|
|
8
|
-
git (
|
|
4
|
+
apt (2.9.2)
|
|
5
|
+
build-essential (2.2.4)
|
|
6
|
+
chef_handler (1.2.0)
|
|
7
|
+
dmg (2.3.0)
|
|
8
|
+
git (4.3.5)
|
|
9
9
|
build-essential (>= 0.0.0)
|
|
10
10
|
dmg (>= 0.0.0)
|
|
11
|
-
runit (>= 1.0.0)
|
|
12
11
|
windows (>= 0.0.0)
|
|
13
|
-
yum (>= 0.0.0)
|
|
12
|
+
yum-epel (>= 0.0.0)
|
|
14
13
|
ruby_build (0.8.0)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
windows (1.11.0)
|
|
14
|
+
ruby_rbenv (1.0.1)
|
|
15
|
+
ruby_build (>= 0.0.0)
|
|
16
|
+
vim (2.0.0)
|
|
17
|
+
windows (1.39.0)
|
|
20
18
|
chef_handler (>= 0.0.0)
|
|
21
|
-
yum (
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
remote: https://github.com/fnichol/chef-rbenv.git
|
|
25
|
-
ref: v0.7.2
|
|
26
|
-
sha: f2b53292e810dd2b43f6121f9958f5f29979dcb1
|
|
27
|
-
specs:
|
|
28
|
-
rbenv (0.7.2)
|
|
19
|
+
yum (3.8.2)
|
|
20
|
+
yum-epel (0.6.5)
|
|
21
|
+
yum (~> 3.2)
|
|
29
22
|
|
|
30
23
|
DEPENDENCIES
|
|
31
|
-
apt (
|
|
32
|
-
build-essential (
|
|
33
|
-
git (
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
vim (
|
|
24
|
+
apt (= 2.9.2)
|
|
25
|
+
build-essential (= 2.2.4)
|
|
26
|
+
git (= 4.3.5)
|
|
27
|
+
ruby_build (= 0.8.0)
|
|
28
|
+
ruby_rbenv (= 1.0.1)
|
|
29
|
+
vim (= 2.0.0)
|
|
37
30
|
|
data/README.md
CHANGED
|
@@ -1,20 +1,11 @@
|
|
|
1
1
|
# Elasticrawl
|
|
2
2
|
|
|
3
|
-
Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
|
|
4
|
-
Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
|
|
3
|
+
* Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
|
|
4
|
+
* Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
|
|
5
|
+
* A list of crawls released by Common Crawl is maintained on the [wiki](https://github.com/rossf7/elasticrawl/wiki).
|
|
6
|
+
* Common Crawl announce new crawls on their [blog](http://blog.commoncrawl.org/).
|
|
5
7
|
|
|
6
|
-
|
|
7
|
-
| -------------- |:---------:|:----------:|:-------:
|
|
8
|
-
| [CC-MAIN-2015-06](http://blog.commoncrawl.org/2015/03/january-2015-crawl-archive-available/) | January 2015 | ~ 1.82 billion | 98
|
|
9
|
-
| [CC-MAIN-2014-52](http://blog.commoncrawl.org/2015/01/december-2014-crawl-archive-available/) | December 2014 | ~ 2.08 billion | 314
|
|
10
|
-
| [CC-MAIN-2014-49](http://blog.commoncrawl.org/2014/12/november-2014-crawl-archive-available/) | November 2014 | ~ 1.95 billion | 136
|
|
11
|
-
| [CC-MAIN-2014-35](http://blog.commoncrawl.org/2014/09/august-2014-crawl-data-available/) | August 2014 | ~ 2.8 billion | 111
|
|
12
|
-
| [CC-MAIN-2014-23](http://blog.commoncrawl.org/2014/08/july-2014-crawl-data-available/) | July 2014 | ~ 3.6 billion | 253
|
|
13
|
-
| [CC-MAIN-2014-15](http://blog.commoncrawl.org/2014/07/april-2014-crawl-data-available/) | April 2014 | ~ 2.3 billion | 70
|
|
14
|
-
|
|
15
|
-
Common Crawl announce new crawls on their [blog](http://blog.commoncrawl.org/).
|
|
16
|
-
|
|
17
|
-
Ships with a default configuration that launches the
|
|
8
|
+
* Ships with a default configuration that launches the
|
|
18
9
|
[elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples) jobs.
|
|
19
10
|
This is an implementation of the standard Hadoop Word Count example.
|
|
20
11
|
|
|
@@ -25,18 +16,27 @@ This [blog post](https://rossfairbanks.com/2015/01/03/parsing-common-crawl-using
|
|
|
25
16
|
Deployment packages are available for Linux and OS X; unfortunately, Windows isn't supported yet. Download the package, extract it and run the elasticrawl command from the package directory.
|
|
26
17
|
|
|
27
18
|
```bash
|
|
28
|
-
# OS X https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.
|
|
29
|
-
# Linux (64-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.
|
|
30
|
-
# Linux (32-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.
|
|
19
|
+
# OS X https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-osx.tar.gz
|
|
20
|
+
# Linux (64-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-linux-x86_64.tar.gz
|
|
21
|
+
# Linux (32-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-linux-x86.tar.gz
|
|
31
22
|
|
|
32
23
|
# e.g.
|
|
33
24
|
|
|
34
|
-
curl -O https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.
|
|
35
|
-
tar -xzf elasticrawl-1.1.
|
|
36
|
-
cd elasticrawl-1.1.
|
|
25
|
+
curl -O https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-osx.tar.gz
|
|
26
|
+
tar -xzf elasticrawl-1.1.5-osx.tar.gz
|
|
27
|
+
cd elasticrawl-1.1.5-osx/
|
|
37
28
|
./elasticrawl --help
|
|
38
29
|
```
|
|
39
30
|
|
|
31
|
+
### Troubleshooting
|
|
32
|
+
|
|
33
|
+
If you get the error "EMR service role arn:aws:iam::156793023547:role/EMR_DefaultRole is invalid" when launching a cluster then you don't have the necessary IAM roles.
|
|
34
|
+
To fix this, install the [AWS CLI](https://aws.amazon.com/cli/) and run the command below.
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
aws emr create-default-roles
|
|
38
|
+
```
|
|
39
|
+
|
|
40
40
|
## Commands
|
|
41
41
|
|
|
42
42
|
### elasticrawl init
|
|
@@ -62,13 +62,13 @@ Config complete
|
|
|
62
62
|
The parse command takes in the crawl name and an optional number of segments and files to parse.
|
|
63
63
|
|
|
64
64
|
```bash
|
|
65
|
-
~$ ./elasticrawl parse CC-MAIN-
|
|
65
|
+
~$ ./elasticrawl parse CC-MAIN-2015-48 --max-segments 2 --max-files 3
|
|
66
66
|
Segments
|
|
67
67
|
Segment: 1416400372202.67 Files: 150
|
|
68
68
|
Segment: 1416400372490.23 Files: 124
|
|
69
69
|
|
|
70
70
|
Job configuration
|
|
71
|
-
Crawl: CC-MAIN-
|
|
71
|
+
Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
|
|
72
72
|
|
|
73
73
|
Cluster configuration
|
|
74
74
|
Master: 1 m1.medium (Spot: 0.12)
|
|
@@ -106,10 +106,10 @@ The status command shows crawls and your job history.
|
|
|
106
106
|
```bash
|
|
107
107
|
~$ ./elasticrawl status
|
|
108
108
|
Crawl Status
|
|
109
|
-
CC-MAIN-
|
|
109
|
+
CC-MAIN-2015-48 Segments: to parse 98, parsed 2, total 100
|
|
110
110
|
|
|
111
111
|
Job History (last 10)
|
|
112
|
-
1420124830792 Launched: 2015-01-01 15:07:10 Crawl: CC-MAIN-
|
|
112
|
+
1420124830792 Launched: 2015-01-01 15:07:10 Crawl: CC-MAIN-2015-48 Segments: 2 Parsing: 3 files per segment
|
|
113
113
|
```
|
|
114
114
|
|
|
115
115
|
### elasticrawl reset
|
|
@@ -117,10 +117,10 @@ Job History (last 10)
|
|
|
117
117
|
The reset command resets a crawl so it is parsed again.
|
|
118
118
|
|
|
119
119
|
```bash
|
|
120
|
-
~$ ./elasticrawl reset CC-MAIN-
|
|
120
|
+
~$ ./elasticrawl reset CC-MAIN-2015-48
|
|
121
121
|
Reset crawl? (y/n)
|
|
122
122
|
y
|
|
123
|
-
CC-MAIN-
|
|
123
|
+
CC-MAIN-2015-48 Segments: to parse 100, parsed 0, total 100
|
|
124
124
|
```
|
|
125
125
|
|
|
126
126
|
### elasticrawl destroy
|
|
@@ -158,13 +158,13 @@ stores your S3 bucket name and the config for the parse and combine jobs
|
|
|
158
158
|
|
|
159
159
|
## Development
|
|
160
160
|
|
|
161
|
-
Elasticrawl is developed in Ruby and requires Ruby 2.0.0 or later (Ruby 2.
|
|
161
|
+
Elasticrawl is developed in Ruby and requires Ruby 2.0.0 or later (Ruby 2.2 is recommended). The sqlite3 and nokogiri gems have C extensions, which means you may need to install development headers.
|
|
162
162
|
|
|
163
163
|
[](http://badge.fury.io/rb/elasticrawl)
|
|
164
164
|
[](https://codeclimate.com/github/rossf7/elasticrawl)
|
|
165
|
-
[](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.
|
|
165
|
+
[](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.8, 2.2.4, 2.3.0
|
|
166
166
|
|
|
167
|
-
The deployment packages are created using [Traveling Ruby](http://phusion.github.io/traveling-ruby/). The deploy packages contain a Ruby 2.
|
|
167
|
+
The deployment packages are created using [Traveling Ruby](http://phusion.github.io/traveling-ruby/). The deploy packages contain a Ruby 2.2 interpreter, Gems and the compiled C extensions. The [traveling-elasticrawl](https://github.com/rossf7/traveling-elasticrawl) repository has a Rake task that automates building the deployment packages.
|
|
168
168
|
|
|
169
169
|
## TODO
|
|
170
170
|
|
data/Vagrantfile
CHANGED
|
@@ -6,17 +6,17 @@ Vagrant.configure("2") do |config|
|
|
|
6
6
|
# options are documented and commented below. For a complete reference,
|
|
7
7
|
# please see the online documentation at vagrantup.com.
|
|
8
8
|
|
|
9
|
-
#
|
|
9
|
+
# Increase RAM to 1 GB
|
|
10
10
|
config.vm.provider "virtualbox" do |vbox|
|
|
11
|
-
vbox.customize ["modifyvm", :id, "--
|
|
11
|
+
vbox.customize ["modifyvm", :id, "--memory", 1024]
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
# Elasticrawl launches Hadoop jobs for the CommonCrawl dataset using the AWS EMR service.
|
|
15
15
|
config.vm.define :elasticrawl do |elasticrawl|
|
|
16
16
|
elasticrawl.vm.box = "elasticrawl"
|
|
17
17
|
|
|
18
|
-
# Ubuntu Server
|
|
19
|
-
elasticrawl.vm.
|
|
18
|
+
# Ubuntu Server 14.04 LTS
|
|
19
|
+
elasticrawl.vm.box = "ubuntu/trusty64"
|
|
20
20
|
|
|
21
21
|
# Network config
|
|
22
22
|
elasticrawl.vm.network :public_network
|
|
@@ -30,7 +30,7 @@ Vagrant.configure("2") do |config|
|
|
|
30
30
|
chef.add_recipe "apt"
|
|
31
31
|
chef.add_recipe "build-essential"
|
|
32
32
|
chef.add_recipe "ruby_build"
|
|
33
|
-
chef.add_recipe "
|
|
33
|
+
chef.add_recipe "ruby_rbenv::user"
|
|
34
34
|
chef.add_recipe "git"
|
|
35
35
|
chef.add_recipe "vim"
|
|
36
36
|
|
|
@@ -39,17 +39,24 @@ Vagrant.configure("2") do |config|
|
|
|
39
39
|
"user_installs" => [
|
|
40
40
|
{
|
|
41
41
|
"user" => "vagrant",
|
|
42
|
-
"rubies" => ["2.0.0-
|
|
43
|
-
"global" => "2.
|
|
42
|
+
"rubies" => ["2.0.0-p648", "2.1.8", "2.2.4", "2.3.0"],
|
|
43
|
+
"global" => "2.2.4",
|
|
44
44
|
"gems" => {
|
|
45
|
-
"2.0.0-
|
|
46
|
-
{ "name" => "bundler"
|
|
45
|
+
"2.0.0-p648" => [
|
|
46
|
+
{ "name" => "bundler",
|
|
47
|
+
"version" => "1.11.2" }
|
|
47
48
|
],
|
|
48
|
-
"2.1.
|
|
49
|
-
{ "name" => "bundler"
|
|
49
|
+
"2.1.8" => [
|
|
50
|
+
{ "name" => "bundler",
|
|
51
|
+
"version" => "1.11.2" }
|
|
50
52
|
],
|
|
51
|
-
"2.2.
|
|
52
|
-
{ "name" => "bundler"
|
|
53
|
+
"2.2.4" => [
|
|
54
|
+
{ "name" => "bundler",
|
|
55
|
+
"version" => "1.11.2" }
|
|
56
|
+
],
|
|
57
|
+
"2.3.0" => [
|
|
58
|
+
{ "name" => "bundler",
|
|
59
|
+
"version" => "1.11.2" }
|
|
53
60
|
]
|
|
54
61
|
}
|
|
55
62
|
}
|
data/elasticrawl.gemspec
CHANGED
|
@@ -18,17 +18,17 @@ Gem::Specification.new do |spec|
|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
19
19
|
spec.require_paths = ['lib']
|
|
20
20
|
|
|
21
|
-
spec.add_dependency 'activerecord', '~> 4.2'
|
|
22
|
-
spec.add_dependency 'activesupport', '~> 4.2'
|
|
23
|
-
spec.add_dependency 'aws-sdk', '~> 1.
|
|
24
|
-
spec.add_dependency 'elasticity', '~>
|
|
25
|
-
spec.add_dependency 'highline', '~> 1.
|
|
26
|
-
spec.add_dependency 'sqlite3', '~> 1.3'
|
|
27
|
-
spec.add_dependency 'thor', '~> 0.19'
|
|
21
|
+
spec.add_dependency 'activerecord', '~> 4.2.5'
|
|
22
|
+
spec.add_dependency 'activesupport', '~> 4.2.5'
|
|
23
|
+
spec.add_dependency 'aws-sdk', '~> 1.66.0'
|
|
24
|
+
spec.add_dependency 'elasticity', '~> 6.0.5'
|
|
25
|
+
spec.add_dependency 'highline', '~> 1.7.8'
|
|
26
|
+
spec.add_dependency 'sqlite3', '~> 1.3.11'
|
|
27
|
+
spec.add_dependency 'thor', '~> 0.19.1'
|
|
28
28
|
|
|
29
|
-
spec.add_development_dependency 'rake'
|
|
30
|
-
spec.add_development_dependency 'bundler', '~> 1.
|
|
31
|
-
spec.add_development_dependency 'rspec', '~> 3.
|
|
32
|
-
spec.add_development_dependency 'database_cleaner', '~> 1.
|
|
33
|
-
spec.add_development_dependency 'shoulda-matchers', '~>
|
|
29
|
+
spec.add_development_dependency 'rake', '~> 10.4.2'
|
|
30
|
+
spec.add_development_dependency 'bundler', '~> 1.11.2'
|
|
31
|
+
spec.add_development_dependency 'rspec', '~> 3.4.0'
|
|
32
|
+
spec.add_development_dependency 'database_cleaner', '~> 1.5.1'
|
|
33
|
+
spec.add_development_dependency 'shoulda-matchers', '~> 3.0.1'
|
|
34
34
|
end
|
data/lib/elasticrawl/cluster.rb
CHANGED
|
@@ -11,8 +11,13 @@ module Elasticrawl
|
|
|
11
11
|
# Returns a configured job flow to the calling job.
|
|
12
12
|
def create_job_flow(job, emr_config = nil)
|
|
13
13
|
config = Config.new
|
|
14
|
-
|
|
15
|
-
|
|
14
|
+
|
|
15
|
+
Elasticity.configure do |c|
|
|
16
|
+
c.access_key = config.access_key_id
|
|
17
|
+
c.secret_key = config.secret_access_key
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
job_flow = Elasticity::JobFlow.new
|
|
16
21
|
job_flow.name = "Job: #{job.job_name} #{job.job_desc}"
|
|
17
22
|
job_flow.log_uri = job.log_uri
|
|
18
23
|
|
data/lib/elasticrawl/version.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
|
@@ -7,6 +7,9 @@ RSpec.configure do |config|
|
|
|
7
7
|
# Run each test in a transaction and rollback data on completion.
|
|
8
8
|
DatabaseCleaner.strategy = :transaction
|
|
9
9
|
|
|
10
|
+
# Use Shoulda matchers for schema tests.
|
|
11
|
+
config.include(Shoulda::Matchers::ActiveRecord, type: :model)
|
|
12
|
+
|
|
10
13
|
config.before(:each) do
|
|
11
14
|
# Stub S3 call to get WARC file paths
|
|
12
15
|
warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
|
data/spec/unit/crawl_spec.rb
CHANGED
data/spec/unit/job_spec.rb
CHANGED
data/spec/unit/job_step_spec.rb
CHANGED
metadata
CHANGED
|
@@ -1,183 +1,183 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: elasticrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.
|
|
4
|
+
version: 1.1.5
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ross Fairbanks
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2016-01-05 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activerecord
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - ~>
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version:
|
|
19
|
+
version: 4.2.5
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - ~>
|
|
24
|
+
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version:
|
|
26
|
+
version: 4.2.5
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: activesupport
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - ~>
|
|
31
|
+
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version:
|
|
33
|
+
version: 4.2.5
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - ~>
|
|
38
|
+
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version:
|
|
40
|
+
version: 4.2.5
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: aws-sdk
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
|
-
- - ~>
|
|
45
|
+
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version:
|
|
47
|
+
version: 1.66.0
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
|
-
- - ~>
|
|
52
|
+
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version:
|
|
54
|
+
version: 1.66.0
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: elasticity
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
|
-
- - ~>
|
|
59
|
+
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
|
-
version:
|
|
61
|
+
version: 6.0.5
|
|
62
62
|
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
|
-
- - ~>
|
|
66
|
+
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version:
|
|
68
|
+
version: 6.0.5
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: highline
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
|
-
- - ~>
|
|
73
|
+
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
|
-
version:
|
|
75
|
+
version: 1.7.8
|
|
76
76
|
type: :runtime
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
|
-
- - ~>
|
|
80
|
+
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
|
-
version:
|
|
82
|
+
version: 1.7.8
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
84
|
name: sqlite3
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
|
87
|
-
- - ~>
|
|
87
|
+
- - "~>"
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
|
-
version:
|
|
89
|
+
version: 1.3.11
|
|
90
90
|
type: :runtime
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
|
-
- - ~>
|
|
94
|
+
- - "~>"
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version:
|
|
96
|
+
version: 1.3.11
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
98
|
name: thor
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements:
|
|
101
|
-
- - ~>
|
|
101
|
+
- - "~>"
|
|
102
102
|
- !ruby/object:Gem::Version
|
|
103
|
-
version:
|
|
103
|
+
version: 0.19.1
|
|
104
104
|
type: :runtime
|
|
105
105
|
prerelease: false
|
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
107
|
requirements:
|
|
108
|
-
- - ~>
|
|
108
|
+
- - "~>"
|
|
109
109
|
- !ruby/object:Gem::Version
|
|
110
|
-
version:
|
|
110
|
+
version: 0.19.1
|
|
111
111
|
- !ruby/object:Gem::Dependency
|
|
112
112
|
name: rake
|
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
|
114
114
|
requirements:
|
|
115
|
-
- -
|
|
115
|
+
- - "~>"
|
|
116
116
|
- !ruby/object:Gem::Version
|
|
117
|
-
version:
|
|
117
|
+
version: 10.4.2
|
|
118
118
|
type: :development
|
|
119
119
|
prerelease: false
|
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
|
121
121
|
requirements:
|
|
122
|
-
- -
|
|
122
|
+
- - "~>"
|
|
123
123
|
- !ruby/object:Gem::Version
|
|
124
|
-
version:
|
|
124
|
+
version: 10.4.2
|
|
125
125
|
- !ruby/object:Gem::Dependency
|
|
126
126
|
name: bundler
|
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
|
128
128
|
requirements:
|
|
129
|
-
- - ~>
|
|
129
|
+
- - "~>"
|
|
130
130
|
- !ruby/object:Gem::Version
|
|
131
|
-
version:
|
|
131
|
+
version: 1.11.2
|
|
132
132
|
type: :development
|
|
133
133
|
prerelease: false
|
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
|
135
135
|
requirements:
|
|
136
|
-
- - ~>
|
|
136
|
+
- - "~>"
|
|
137
137
|
- !ruby/object:Gem::Version
|
|
138
|
-
version:
|
|
138
|
+
version: 1.11.2
|
|
139
139
|
- !ruby/object:Gem::Dependency
|
|
140
140
|
name: rspec
|
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
|
142
142
|
requirements:
|
|
143
|
-
- - ~>
|
|
143
|
+
- - "~>"
|
|
144
144
|
- !ruby/object:Gem::Version
|
|
145
|
-
version:
|
|
145
|
+
version: 3.4.0
|
|
146
146
|
type: :development
|
|
147
147
|
prerelease: false
|
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
|
149
149
|
requirements:
|
|
150
|
-
- - ~>
|
|
150
|
+
- - "~>"
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
|
-
version:
|
|
152
|
+
version: 3.4.0
|
|
153
153
|
- !ruby/object:Gem::Dependency
|
|
154
154
|
name: database_cleaner
|
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
|
156
156
|
requirements:
|
|
157
|
-
- - ~>
|
|
157
|
+
- - "~>"
|
|
158
158
|
- !ruby/object:Gem::Version
|
|
159
|
-
version: 1.
|
|
159
|
+
version: 1.5.1
|
|
160
160
|
type: :development
|
|
161
161
|
prerelease: false
|
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
|
163
163
|
requirements:
|
|
164
|
-
- - ~>
|
|
164
|
+
- - "~>"
|
|
165
165
|
- !ruby/object:Gem::Version
|
|
166
|
-
version: 1.
|
|
166
|
+
version: 1.5.1
|
|
167
167
|
- !ruby/object:Gem::Dependency
|
|
168
168
|
name: shoulda-matchers
|
|
169
169
|
requirement: !ruby/object:Gem::Requirement
|
|
170
170
|
requirements:
|
|
171
|
-
- - ~>
|
|
171
|
+
- - "~>"
|
|
172
172
|
- !ruby/object:Gem::Version
|
|
173
|
-
version:
|
|
173
|
+
version: 3.0.1
|
|
174
174
|
type: :development
|
|
175
175
|
prerelease: false
|
|
176
176
|
version_requirements: !ruby/object:Gem::Requirement
|
|
177
177
|
requirements:
|
|
178
|
-
- - ~>
|
|
178
|
+
- - "~>"
|
|
179
179
|
- !ruby/object:Gem::Version
|
|
180
|
-
version:
|
|
180
|
+
version: 3.0.1
|
|
181
181
|
description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
|
|
182
182
|
Common Crawl data.
|
|
183
183
|
email:
|
|
@@ -187,8 +187,8 @@ executables:
|
|
|
187
187
|
extensions: []
|
|
188
188
|
extra_rdoc_files: []
|
|
189
189
|
files:
|
|
190
|
-
- .gitignore
|
|
191
|
-
- .travis.yml
|
|
190
|
+
- ".gitignore"
|
|
191
|
+
- ".travis.yml"
|
|
192
192
|
- CHANGELOG.md
|
|
193
193
|
- Cheffile
|
|
194
194
|
- Cheffile.lock
|
|
@@ -241,17 +241,17 @@ require_paths:
|
|
|
241
241
|
- lib
|
|
242
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
243
243
|
requirements:
|
|
244
|
-
- -
|
|
244
|
+
- - ">="
|
|
245
245
|
- !ruby/object:Gem::Version
|
|
246
246
|
version: '0'
|
|
247
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
248
248
|
requirements:
|
|
249
|
-
- -
|
|
249
|
+
- - ">="
|
|
250
250
|
- !ruby/object:Gem::Version
|
|
251
251
|
version: '0'
|
|
252
252
|
requirements: []
|
|
253
253
|
rubyforge_project:
|
|
254
|
-
rubygems_version: 2.
|
|
254
|
+
rubygems_version: 2.4.5.1
|
|
255
255
|
signing_key:
|
|
256
256
|
specification_version: 4
|
|
257
257
|
summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
|