elasticrawl 1.1.3 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/CHANGELOG.md +5 -5
- data/README.md +10 -9
- data/Vagrantfile +4 -4
- data/lib/elasticrawl/cluster.rb +4 -0
- data/lib/elasticrawl/crawl_segment.rb +1 -1
- data/lib/elasticrawl/version.rb +1 -1
- data/templates/cluster.yml +6 -0
- metadata +31 -31
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2a1024132524ee242004a94b6d8d029f8d77a54e
|
|
4
|
+
data.tar.gz: 11269b2244d5d7ecddb9a09b02341aae581afeb7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1d14f20dd70d8e27a7c68b96c6fbe9e81505189a8dfc827997599ba2abb12095cab90abbca010ecafd81499b84fae1e4b9905218d15b9bf517c12e22241e6182
|
|
7
|
+
data.tar.gz: 606e4ded5b041880d48049bb1de91182757a01bbc8bdb835df516d57e90b8155ac595b27177ea94cd7cad0730786045d1fc3fcf0b4fdd690e7d8f0e5d35ce4c4
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
## v1.1.3 / 2015-02-04
|
|
2
2
|
* Upgrade Traveling Ruby to 20150204-2.1.5
|
|
3
3
|
|
|
4
|
-
|
|
4
|
+
## v1.1.2 / 2015-01-27
|
|
5
5
|
* Improve error handling for S3 API calls
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
## v1.1.1 / 2015-01-27
|
|
8
8
|
* Use Traveling Ruby to build deploy packages
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
## v1.1.0 / 2015-01-03
|
|
11
11
|
* Show file counts for each segment
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
## v1.0.0 / 2014-02-04
|
|
14
14
|
* Initial release
|
data/README.md
CHANGED
|
@@ -3,13 +3,14 @@
|
|
|
3
3
|
Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
|
|
4
4
|
Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
|
|
5
5
|
|
|
6
|
-
| Crawl Name | Month | Web Pages
|
|
7
|
-
| --------------
|
|
8
|
-
| [CC-MAIN-
|
|
9
|
-
| [CC-MAIN-2014-
|
|
10
|
-
| [CC-MAIN-2014-
|
|
11
|
-
| [CC-MAIN-2014-
|
|
12
|
-
| [CC-MAIN-2014-
|
|
6
|
+
| Crawl Name | Month | Web Pages | Segments
|
|
7
|
+
| -------------- |:---------:|:----------:|:-------:
|
|
8
|
+
| [CC-MAIN-2015-06](http://blog.commoncrawl.org/2015/03/january-2015-crawl-archive-available/) | January 2015 | ~ 1.82 billion | 98
|
|
9
|
+
| [CC-MAIN-2014-52](http://blog.commoncrawl.org/2015/01/december-2014-crawl-archive-available/) | December 2014 | ~ 2.08 billion | 314
|
|
10
|
+
| [CC-MAIN-2014-49](http://blog.commoncrawl.org/2014/12/november-2014-crawl-archive-available/) | November 2014 | ~ 1.95 billion | 136
|
|
11
|
+
| [CC-MAIN-2014-35](http://blog.commoncrawl.org/2014/09/august-2014-crawl-data-available/) | August 2014 | ~ 2.8 billion | 111
|
|
12
|
+
| [CC-MAIN-2014-23](http://blog.commoncrawl.org/2014/08/july-2014-crawl-data-available/) | July 2014 | ~ 3.6 billion | 253
|
|
13
|
+
| [CC-MAIN-2014-15](http://blog.commoncrawl.org/2014/07/april-2014-crawl-data-available/) | April 2014 | ~ 2.3 billion | 70
|
|
13
14
|
|
|
14
15
|
Common Crawl announce new crawls on their [blog](http://blog.commoncrawl.org/).
|
|
15
16
|
|
|
@@ -157,11 +158,11 @@ stores your S3 bucket name and the config for the parse and combine jobs
|
|
|
157
158
|
|
|
158
159
|
## Development
|
|
159
160
|
|
|
160
|
-
Elasticrawl is developed in Ruby and requires Ruby
|
|
161
|
+
Elasticrawl is developed in Ruby and requires Ruby 2.0.0 or later (Ruby 2.1 is recommended). The sqlite3 and nokogiri gems have C extensions, which means you may need to install development headers.
|
|
161
162
|
|
|
162
163
|
[](http://badge.fury.io/rb/elasticrawl)
|
|
163
164
|
[](https://codeclimate.com/github/rossf7/elasticrawl)
|
|
164
|
-
[](https://travis-ci.org/rossf7/elasticrawl)
|
|
165
|
+
[](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.5, 2.2.0
|
|
165
166
|
|
|
166
167
|
The deployment packages are created using [Traveling Ruby](http://phusion.github.io/traveling-ruby/). The deploy packages contain a Ruby 2.1 interpreter, Gems and the compiled C extensions. The [traveling-elasticrawl](https://github.com/rossf7/traveling-elasticrawl) repository has a Rake task that automates building the deployment packages.
|
|
167
168
|
|
data/Vagrantfile
CHANGED
|
@@ -39,16 +39,16 @@ Vagrant.configure("2") do |config|
|
|
|
39
39
|
"user_installs" => [
|
|
40
40
|
{
|
|
41
41
|
"user" => "vagrant",
|
|
42
|
-
"rubies" => ["
|
|
42
|
+
"rubies" => ["2.0.0-p643", "2.1.5", "2.2.0"],
|
|
43
43
|
"global" => "2.1.5",
|
|
44
44
|
"gems" => {
|
|
45
|
-
"
|
|
45
|
+
"2.0.0-p643" => [
|
|
46
46
|
{ "name" => "bundler" }
|
|
47
47
|
],
|
|
48
|
-
"2.
|
|
48
|
+
"2.1.5" => [
|
|
49
49
|
{ "name" => "bundler" }
|
|
50
50
|
],
|
|
51
|
-
"2.
|
|
51
|
+
"2.2.0" => [
|
|
52
52
|
{ "name" => "bundler" }
|
|
53
53
|
]
|
|
54
54
|
}
|
data/lib/elasticrawl/cluster.rb
CHANGED
|
@@ -40,10 +40,14 @@ HERE
|
|
|
40
40
|
ec2_key_name = config_setting('ec2_key_name')
|
|
41
41
|
placement = config_setting('placement')
|
|
42
42
|
emr_ami_version = config_setting('emr_ami_version')
|
|
43
|
+
job_flow_role = config_setting('job_flow_role')
|
|
44
|
+
service_role = config_setting('service_role')
|
|
43
45
|
|
|
44
46
|
job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
|
|
45
47
|
job_flow.placement = placement if placement.present?
|
|
46
48
|
job_flow.ami_version = emr_ami_version if emr_ami_version.present?
|
|
49
|
+
job_flow.job_flow_role = job_flow_role if job_flow_role.present?
|
|
50
|
+
job_flow.service_role = service_role if service_role.present?
|
|
47
51
|
end
|
|
48
52
|
|
|
49
53
|
# Configures the instances that will be launched. The master group has
|
data/lib/elasticrawl/version.rb
CHANGED
data/templates/cluster.yml
CHANGED
metadata
CHANGED
|
@@ -1,181 +1,181 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: elasticrawl
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.1.
|
|
4
|
+
version: 1.1.4
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ross Fairbanks
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-
|
|
11
|
+
date: 2015-07-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: activerecord
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- -
|
|
17
|
+
- - ~>
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
19
|
version: '4.2'
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- -
|
|
24
|
+
- - ~>
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '4.2'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: activesupport
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- -
|
|
31
|
+
- - ~>
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: '4.2'
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- -
|
|
38
|
+
- - ~>
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '4.2'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: aws-sdk
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
|
-
- -
|
|
45
|
+
- - ~>
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
47
|
version: '1.60'
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
|
-
- -
|
|
52
|
+
- - ~>
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '1.60'
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: elasticity
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
|
-
- -
|
|
59
|
+
- - ~>
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
61
|
version: '4.0'
|
|
62
62
|
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
|
-
- -
|
|
66
|
+
- - ~>
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
68
|
version: '4.0'
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: highline
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
|
-
- -
|
|
73
|
+
- - ~>
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
75
|
version: '1.6'
|
|
76
76
|
type: :runtime
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
|
-
- -
|
|
80
|
+
- - ~>
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
82
|
version: '1.6'
|
|
83
83
|
- !ruby/object:Gem::Dependency
|
|
84
84
|
name: sqlite3
|
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
|
86
86
|
requirements:
|
|
87
|
-
- -
|
|
87
|
+
- - ~>
|
|
88
88
|
- !ruby/object:Gem::Version
|
|
89
89
|
version: '1.3'
|
|
90
90
|
type: :runtime
|
|
91
91
|
prerelease: false
|
|
92
92
|
version_requirements: !ruby/object:Gem::Requirement
|
|
93
93
|
requirements:
|
|
94
|
-
- -
|
|
94
|
+
- - ~>
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
96
|
version: '1.3'
|
|
97
97
|
- !ruby/object:Gem::Dependency
|
|
98
98
|
name: thor
|
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
|
100
100
|
requirements:
|
|
101
|
-
- -
|
|
101
|
+
- - ~>
|
|
102
102
|
- !ruby/object:Gem::Version
|
|
103
103
|
version: '0.19'
|
|
104
104
|
type: :runtime
|
|
105
105
|
prerelease: false
|
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
|
107
107
|
requirements:
|
|
108
|
-
- -
|
|
108
|
+
- - ~>
|
|
109
109
|
- !ruby/object:Gem::Version
|
|
110
110
|
version: '0.19'
|
|
111
111
|
- !ruby/object:Gem::Dependency
|
|
112
112
|
name: rake
|
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
|
114
114
|
requirements:
|
|
115
|
-
- -
|
|
115
|
+
- - '>='
|
|
116
116
|
- !ruby/object:Gem::Version
|
|
117
117
|
version: '0'
|
|
118
118
|
type: :development
|
|
119
119
|
prerelease: false
|
|
120
120
|
version_requirements: !ruby/object:Gem::Requirement
|
|
121
121
|
requirements:
|
|
122
|
-
- -
|
|
122
|
+
- - '>='
|
|
123
123
|
- !ruby/object:Gem::Version
|
|
124
124
|
version: '0'
|
|
125
125
|
- !ruby/object:Gem::Dependency
|
|
126
126
|
name: bundler
|
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
|
128
128
|
requirements:
|
|
129
|
-
- -
|
|
129
|
+
- - ~>
|
|
130
130
|
- !ruby/object:Gem::Version
|
|
131
131
|
version: '1.3'
|
|
132
132
|
type: :development
|
|
133
133
|
prerelease: false
|
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
|
135
135
|
requirements:
|
|
136
|
-
- -
|
|
136
|
+
- - ~>
|
|
137
137
|
- !ruby/object:Gem::Version
|
|
138
138
|
version: '1.3'
|
|
139
139
|
- !ruby/object:Gem::Dependency
|
|
140
140
|
name: rspec
|
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
|
142
142
|
requirements:
|
|
143
|
-
- -
|
|
143
|
+
- - ~>
|
|
144
144
|
- !ruby/object:Gem::Version
|
|
145
145
|
version: '3.1'
|
|
146
146
|
type: :development
|
|
147
147
|
prerelease: false
|
|
148
148
|
version_requirements: !ruby/object:Gem::Requirement
|
|
149
149
|
requirements:
|
|
150
|
-
- -
|
|
150
|
+
- - ~>
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
152
|
version: '3.1'
|
|
153
153
|
- !ruby/object:Gem::Dependency
|
|
154
154
|
name: database_cleaner
|
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
|
156
156
|
requirements:
|
|
157
|
-
- -
|
|
157
|
+
- - ~>
|
|
158
158
|
- !ruby/object:Gem::Version
|
|
159
159
|
version: 1.3.0
|
|
160
160
|
type: :development
|
|
161
161
|
prerelease: false
|
|
162
162
|
version_requirements: !ruby/object:Gem::Requirement
|
|
163
163
|
requirements:
|
|
164
|
-
- -
|
|
164
|
+
- - ~>
|
|
165
165
|
- !ruby/object:Gem::Version
|
|
166
166
|
version: 1.3.0
|
|
167
167
|
- !ruby/object:Gem::Dependency
|
|
168
168
|
name: shoulda-matchers
|
|
169
169
|
requirement: !ruby/object:Gem::Requirement
|
|
170
170
|
requirements:
|
|
171
|
-
- -
|
|
171
|
+
- - ~>
|
|
172
172
|
- !ruby/object:Gem::Version
|
|
173
173
|
version: 2.7.0
|
|
174
174
|
type: :development
|
|
175
175
|
prerelease: false
|
|
176
176
|
version_requirements: !ruby/object:Gem::Requirement
|
|
177
177
|
requirements:
|
|
178
|
-
- -
|
|
178
|
+
- - ~>
|
|
179
179
|
- !ruby/object:Gem::Version
|
|
180
180
|
version: 2.7.0
|
|
181
181
|
description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
|
|
@@ -187,8 +187,8 @@ executables:
|
|
|
187
187
|
extensions: []
|
|
188
188
|
extra_rdoc_files: []
|
|
189
189
|
files:
|
|
190
|
-
-
|
|
191
|
-
-
|
|
190
|
+
- .gitignore
|
|
191
|
+
- .travis.yml
|
|
192
192
|
- CHANGELOG.md
|
|
193
193
|
- Cheffile
|
|
194
194
|
- Cheffile.lock
|
|
@@ -241,17 +241,17 @@ require_paths:
|
|
|
241
241
|
- lib
|
|
242
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
243
243
|
requirements:
|
|
244
|
-
- -
|
|
244
|
+
- - '>='
|
|
245
245
|
- !ruby/object:Gem::Version
|
|
246
246
|
version: '0'
|
|
247
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
248
248
|
requirements:
|
|
249
|
-
- -
|
|
249
|
+
- - '>='
|
|
250
250
|
- !ruby/object:Gem::Version
|
|
251
251
|
version: '0'
|
|
252
252
|
requirements: []
|
|
253
253
|
rubyforge_project:
|
|
254
|
-
rubygems_version: 2.
|
|
254
|
+
rubygems_version: 2.0.14
|
|
255
255
|
signing_key:
|
|
256
256
|
specification_version: 4
|
|
257
257
|
summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
|