elasticrawl 1.1.3 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1b89b4d0a38bb042d8d11d8f995ae1f33a624e00
4
- data.tar.gz: 00b15707dbbc3c828fe505c94ef6537fdf11e5b1
3
+ metadata.gz: 2a1024132524ee242004a94b6d8d029f8d77a54e
4
+ data.tar.gz: 11269b2244d5d7ecddb9a09b02341aae581afeb7
5
5
  SHA512:
6
- metadata.gz: a032013bb65c06eedc850d4a0e8b986bd76a92d2fec9c08712e718733749dce8c0a17c12db8b66b67491b1b1726cd452cb9e19244f6332e92e5ab878a169fc8f
7
- data.tar.gz: 3c45a331fd2b81335b051c6d0d371626ea3c8f440ddac934ae95bf7d99b7aed1ffc69483a46763d905f48d7829a1beeba13b0559281927324bab129907c33def
6
+ metadata.gz: 1d14f20dd70d8e27a7c68b96c6fbe9e81505189a8dfc827997599ba2abb12095cab90abbca010ecafd81499b84fae1e4b9905218d15b9bf517c12e22241e6182
7
+ data.tar.gz: 606e4ded5b041880d48049bb1de91182757a01bbc8bdb835df516d57e90b8155ac595b27177ea94cd7cad0730786045d1fc3fcf0b4fdd690e7d8f0e5d35ce4c4
data/.travis.yml CHANGED
@@ -1,6 +1,5 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 1.9.3
4
3
  - 2.0.0
5
4
  - 2.1.5
6
5
  - 2.2.0
data/CHANGELOG.md CHANGED
@@ -1,14 +1,14 @@
1
- === v1.1.3 / 2015-02-04
1
+ ## v1.1.3 / 2015-02-04
2
2
  * Upgrade Traveling Ruby to 20150204-2.1.5
3
3
 
4
- === v1.1.2 / 2015-01-27
4
+ ## v1.1.2 / 2015-01-27
5
5
  * Improve error handling for S3 API calls
6
6
 
7
- === v1.1.1 / 2015-01-27
7
+ ## v1.1.1 / 2015-01-27
8
8
  * Use Traveling Ruby to build deploy packages
9
9
 
10
- === v1.1.0 / 2015-01-03
10
+ ## v1.1.0 / 2015-01-03
11
11
  * Show file counts for each segment
12
12
 
13
- === v1.0.0 / 2014-02-04
13
+ ## v1.0.0 / 2014-02-04
14
14
  * Initial release
data/README.md CHANGED
@@ -3,13 +3,14 @@
3
3
  Command line tool for launching Hadoop jobs using AWS EMR (Elastic MapReduce) to process Common Crawl data.
4
4
  Elasticrawl can be used with [crawl data](http://commoncrawl.org/the-data/get-started/) from April 2014 onwards.
5
5
 
6
- | Crawl Name | Month | Web Pages
7
- | -------------- |:--------:|:--------:|
8
- | [CC-MAIN-2014-52](http://blog.commoncrawl.org/2015/01/december-2014-crawl-archive-available/) | December 2014 | ~ 2.08 billion
9
- | [CC-MAIN-2014-49](http://blog.commoncrawl.org/2014/12/november-2014-crawl-archive-available/) | November 2014 | ~ 1.95 billion
10
- | [CC-MAIN-2014-35](http://blog.commoncrawl.org/2014/09/august-2014-crawl-data-available/) | August 2014 | ~ 2.8 billion
11
- | [CC-MAIN-2014-23](http://blog.commoncrawl.org/2014/08/july-2014-crawl-data-available/) | July 2014 | ~ 3.6 billion
12
- | [CC-MAIN-2014-15](http://blog.commoncrawl.org/2014/07/april-2014-crawl-data-available/) | April 2014 | ~ 2.3 billion
6
+ | Crawl Name | Month | Web Pages | Segments
7
+ | -------------- |:---------:|:----------:|:-------:
8
+ | [CC-MAIN-2015-06](http://blog.commoncrawl.org/2015/03/january-2015-crawl-archive-available/) | January 2015 | ~ 1.82 billion | 98
9
+ | [CC-MAIN-2014-52](http://blog.commoncrawl.org/2015/01/december-2014-crawl-archive-available/) | December 2014 | ~ 2.08 billion | 314
10
+ | [CC-MAIN-2014-49](http://blog.commoncrawl.org/2014/12/november-2014-crawl-archive-available/) | November 2014 | ~ 1.95 billion | 136
11
+ | [CC-MAIN-2014-35](http://blog.commoncrawl.org/2014/09/august-2014-crawl-data-available/) | August 2014 | ~ 2.8 billion | 111
12
+ | [CC-MAIN-2014-23](http://blog.commoncrawl.org/2014/08/july-2014-crawl-data-available/) | July 2014 | ~ 3.6 billion | 253
13
+ | [CC-MAIN-2014-15](http://blog.commoncrawl.org/2014/07/april-2014-crawl-data-available/) | April 2014 | ~ 2.3 billion | 70
13
14
 
14
15
  Common Crawl announce new crawls on their [blog](http://blog.commoncrawl.org/).
15
16
 
@@ -157,11 +158,11 @@ stores your S3 bucket name and the config for the parse and combine jobs
157
158
 
158
159
  ## Development
159
160
 
160
- Elasticrawl is developed in Ruby and requires Ruby 1.9.3 or later (Ruby 2.1 is recommended). The sqlite3 and nokogiri gems have C extensions which mean you may need to install development headers.
161
+ Elasticrawl is developed in Ruby and requires Ruby 2.0.0 or later (Ruby 2.1 is recommended). The sqlite3 and nokogiri gems have C extensions, which means you may need to install development headers.
161
162
 
162
163
  [![Gem Version](https://badge.fury.io/rb/elasticrawl.png)](http://badge.fury.io/rb/elasticrawl)
163
164
  [![Code Climate](https://codeclimate.com/github/rossf7/elasticrawl.png)](https://codeclimate.com/github/rossf7/elasticrawl)
164
- [![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) 1.9.3, 2.0.0, 2.1.5, 2.2.0
165
+ [![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.5, 2.2.0
165
166
 
166
167
  The deployment packages are created using [Traveling Ruby](http://phusion.github.io/traveling-ruby/). The deploy packages contain a Ruby 2.1 interpreter, Gems and the compiled C extensions. The [traveling-elasticrawl](https://github.com/rossf7/traveling-elasticrawl) repository has a Rake task that automates building the deployment packages.
167
168
 
data/Vagrantfile CHANGED
@@ -39,16 +39,16 @@ Vagrant.configure("2") do |config|
39
39
  "user_installs" => [
40
40
  {
41
41
  "user" => "vagrant",
42
- "rubies" => ["1.9.3-p551", "2.0.0-p598", "2.1.5"],
42
+ "rubies" => ["2.0.0-p643", "2.1.5", "2.2.0"],
43
43
  "global" => "2.1.5",
44
44
  "gems" => {
45
- "1.9.3-p551" => [
45
+ "2.0.0-p643" => [
46
46
  { "name" => "bundler" }
47
47
  ],
48
- "2.0.0-p598" => [
48
+ "2.1.5" => [
49
49
  { "name" => "bundler" }
50
50
  ],
51
- "2.1.5" => [
51
+ "2.2.0" => [
52
52
  { "name" => "bundler" }
53
53
  ]
54
54
  }
@@ -40,10 +40,14 @@ HERE
40
40
  ec2_key_name = config_setting('ec2_key_name')
41
41
  placement = config_setting('placement')
42
42
  emr_ami_version = config_setting('emr_ami_version')
43
+ job_flow_role = config_setting('job_flow_role')
44
+ service_role = config_setting('service_role')
43
45
 
44
46
  job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
45
47
  job_flow.placement = placement if placement.present?
46
48
  job_flow.ami_version = emr_ami_version if emr_ami_version.present?
49
+ job_flow.job_flow_role = job_flow_role if job_flow_role.present?
50
+ job_flow.service_role = service_role if service_role.present?
47
51
  end
48
52
 
49
53
  # Configures the instances that will be launched. The master group has
@@ -32,7 +32,7 @@ private
32
32
 
33
33
  URI::Generic.build(:scheme => 's3',
34
34
  :host => Elasticrawl::COMMON_CRAWL_BUCKET,
35
- :path => s3_path.join('/'))
35
+ :path => s3_path.join('/')).to_s
36
36
  end
37
37
  end
38
38
  end
@@ -1,3 +1,3 @@
1
1
  module Elasticrawl
2
- VERSION = '1.1.3'
2
+ VERSION = '1.1.4'
3
3
  end
@@ -42,3 +42,9 @@ placement: 'us-east-1a'
42
42
 
43
43
  # The AMI version to use when launching instances.
44
44
  emr_ami_version: 'latest'
45
+
46
+ # Default instance profile
47
+ job_flow_role: 'EMR_EC2_DefaultRole'
48
+
49
+ # Default service role
50
+ service_role: 'EMR_DefaultRole'
metadata CHANGED
@@ -1,181 +1,181 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: elasticrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.3
4
+ version: 1.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ross Fairbanks
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-02-05 00:00:00.000000000 Z
11
+ date: 2015-07-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ~>
18
18
  - !ruby/object:Gem::Version
19
19
  version: '4.2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ~>
25
25
  - !ruby/object:Gem::Version
26
26
  version: '4.2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: activesupport
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ~>
32
32
  - !ruby/object:Gem::Version
33
33
  version: '4.2'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ~>
39
39
  - !ruby/object:Gem::Version
40
40
  version: '4.2'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: aws-sdk
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - "~>"
45
+ - - ~>
46
46
  - !ruby/object:Gem::Version
47
47
  version: '1.60'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - "~>"
52
+ - - ~>
53
53
  - !ruby/object:Gem::Version
54
54
  version: '1.60'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: elasticity
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - "~>"
59
+ - - ~>
60
60
  - !ruby/object:Gem::Version
61
61
  version: '4.0'
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
- - - "~>"
66
+ - - ~>
67
67
  - !ruby/object:Gem::Version
68
68
  version: '4.0'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: highline
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - "~>"
73
+ - - ~>
74
74
  - !ruby/object:Gem::Version
75
75
  version: '1.6'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - "~>"
80
+ - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: '1.6'
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: sqlite3
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - "~>"
87
+ - - ~>
88
88
  - !ruby/object:Gem::Version
89
89
  version: '1.3'
90
90
  type: :runtime
91
91
  prerelease: false
92
92
  version_requirements: !ruby/object:Gem::Requirement
93
93
  requirements:
94
- - - "~>"
94
+ - - ~>
95
95
  - !ruby/object:Gem::Version
96
96
  version: '1.3'
97
97
  - !ruby/object:Gem::Dependency
98
98
  name: thor
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
- - - "~>"
101
+ - - ~>
102
102
  - !ruby/object:Gem::Version
103
103
  version: '0.19'
104
104
  type: :runtime
105
105
  prerelease: false
106
106
  version_requirements: !ruby/object:Gem::Requirement
107
107
  requirements:
108
- - - "~>"
108
+ - - ~>
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0.19'
111
111
  - !ruby/object:Gem::Dependency
112
112
  name: rake
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
- - - ">="
115
+ - - '>='
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  type: :development
119
119
  prerelease: false
120
120
  version_requirements: !ruby/object:Gem::Requirement
121
121
  requirements:
122
- - - ">="
122
+ - - '>='
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
126
  name: bundler
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
- - - "~>"
129
+ - - ~>
130
130
  - !ruby/object:Gem::Version
131
131
  version: '1.3'
132
132
  type: :development
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
- - - "~>"
136
+ - - ~>
137
137
  - !ruby/object:Gem::Version
138
138
  version: '1.3'
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: rspec
141
141
  requirement: !ruby/object:Gem::Requirement
142
142
  requirements:
143
- - - "~>"
143
+ - - ~>
144
144
  - !ruby/object:Gem::Version
145
145
  version: '3.1'
146
146
  type: :development
147
147
  prerelease: false
148
148
  version_requirements: !ruby/object:Gem::Requirement
149
149
  requirements:
150
- - - "~>"
150
+ - - ~>
151
151
  - !ruby/object:Gem::Version
152
152
  version: '3.1'
153
153
  - !ruby/object:Gem::Dependency
154
154
  name: database_cleaner
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
- - - "~>"
157
+ - - ~>
158
158
  - !ruby/object:Gem::Version
159
159
  version: 1.3.0
160
160
  type: :development
161
161
  prerelease: false
162
162
  version_requirements: !ruby/object:Gem::Requirement
163
163
  requirements:
164
- - - "~>"
164
+ - - ~>
165
165
  - !ruby/object:Gem::Version
166
166
  version: 1.3.0
167
167
  - !ruby/object:Gem::Dependency
168
168
  name: shoulda-matchers
169
169
  requirement: !ruby/object:Gem::Requirement
170
170
  requirements:
171
- - - "~>"
171
+ - - ~>
172
172
  - !ruby/object:Gem::Version
173
173
  version: 2.7.0
174
174
  type: :development
175
175
  prerelease: false
176
176
  version_requirements: !ruby/object:Gem::Requirement
177
177
  requirements:
178
- - - "~>"
178
+ - - ~>
179
179
  - !ruby/object:Gem::Version
180
180
  version: 2.7.0
181
181
  description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
@@ -187,8 +187,8 @@ executables:
187
187
  extensions: []
188
188
  extra_rdoc_files: []
189
189
  files:
190
- - ".gitignore"
191
- - ".travis.yml"
190
+ - .gitignore
191
+ - .travis.yml
192
192
  - CHANGELOG.md
193
193
  - Cheffile
194
194
  - Cheffile.lock
@@ -241,17 +241,17 @@ require_paths:
241
241
  - lib
242
242
  required_ruby_version: !ruby/object:Gem::Requirement
243
243
  requirements:
244
- - - ">="
244
+ - - '>='
245
245
  - !ruby/object:Gem::Version
246
246
  version: '0'
247
247
  required_rubygems_version: !ruby/object:Gem::Requirement
248
248
  requirements:
249
- - - ">="
249
+ - - '>='
250
250
  - !ruby/object:Gem::Version
251
251
  version: '0'
252
252
  requirements: []
253
253
  rubyforge_project:
254
- rubygems_version: 2.2.2
254
+ rubygems_version: 2.0.14
255
255
  signing_key:
256
256
  specification_version: 4
257
257
  summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.