elasticrawl 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +1 -1
- data/README.md +77 -108
- data/Vagrantfile +5 -5
- data/db/migrate/201401051536_create_crawls.rb +1 -1
- data/db/migrate/201401051855_create_crawl_segments.rb +1 -1
- data/db/migrate/201401101723_create_jobs.rb +1 -1
- data/db/migrate/201401141606_create_job_steps.rb +1 -1
- data/db/migrate/201412311554_add_file_count_to_crawl_segments.rb +5 -0
- data/elasticrawl.gemspec +10 -11
- data/lib/elasticrawl.rb +7 -0
- data/lib/elasticrawl/cluster.rb +1 -1
- data/lib/elasticrawl/crawl.rb +49 -31
- data/lib/elasticrawl/crawl_segment.rb +30 -0
- data/lib/elasticrawl/job.rb +13 -6
- data/lib/elasticrawl/job_step.rb +5 -3
- data/lib/elasticrawl/parse_job.rb +14 -0
- data/lib/elasticrawl/version.rb +1 -1
- data/spec/fixtures/warc.paths +6 -0
- data/spec/spec_helper.rb +8 -14
- data/spec/unit/cluster_spec.rb +2 -2
- data/spec/unit/combine_job_spec.rb +4 -4
- data/spec/unit/crawl_segment_spec.rb +19 -10
- data/spec/unit/crawl_spec.rb +21 -16
- data/spec/unit/job_step_spec.rb +4 -4
- data/spec/unit/parse_job_spec.rb +20 -14
- metadata +56 -101
metadata
CHANGED
@@ -1,224 +1,183 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: elasticrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
5
|
-
prerelease:
|
4
|
+
version: 1.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Ross Fairbanks
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2015-01-03 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: activerecord
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- - ~>
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: 4.
|
19
|
+
version: '4.2'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- - ~>
|
24
|
+
- - "~>"
|
28
25
|
- !ruby/object:Gem::Version
|
29
|
-
version: 4.
|
26
|
+
version: '4.2'
|
30
27
|
- !ruby/object:Gem::Dependency
|
31
28
|
name: activesupport
|
32
29
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
30
|
requirements:
|
35
|
-
- - ~>
|
31
|
+
- - "~>"
|
36
32
|
- !ruby/object:Gem::Version
|
37
|
-
version: 4.
|
33
|
+
version: '4.2'
|
38
34
|
type: :runtime
|
39
35
|
prerelease: false
|
40
36
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
37
|
requirements:
|
43
|
-
- - ~>
|
38
|
+
- - "~>"
|
44
39
|
- !ruby/object:Gem::Version
|
45
|
-
version: 4.
|
40
|
+
version: '4.2'
|
46
41
|
- !ruby/object:Gem::Dependency
|
47
42
|
name: aws-sdk
|
48
43
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
44
|
requirements:
|
51
|
-
- - ~>
|
45
|
+
- - "~>"
|
52
46
|
- !ruby/object:Gem::Version
|
53
|
-
version: '1.
|
47
|
+
version: '1.60'
|
54
48
|
type: :runtime
|
55
49
|
prerelease: false
|
56
50
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
51
|
requirements:
|
59
|
-
- - ~>
|
52
|
+
- - "~>"
|
60
53
|
- !ruby/object:Gem::Version
|
61
|
-
version: '1.
|
54
|
+
version: '1.60'
|
62
55
|
- !ruby/object:Gem::Dependency
|
63
56
|
name: elasticity
|
64
57
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
58
|
requirements:
|
67
|
-
- - ~>
|
59
|
+
- - "~>"
|
68
60
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
61
|
+
version: '4.0'
|
70
62
|
type: :runtime
|
71
63
|
prerelease: false
|
72
64
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
65
|
requirements:
|
75
|
-
- - ~>
|
66
|
+
- - "~>"
|
76
67
|
- !ruby/object:Gem::Version
|
77
|
-
version: '
|
68
|
+
version: '4.0'
|
78
69
|
- !ruby/object:Gem::Dependency
|
79
70
|
name: highline
|
80
71
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
72
|
requirements:
|
83
|
-
- - ~>
|
73
|
+
- - "~>"
|
84
74
|
- !ruby/object:Gem::Version
|
85
|
-
version: 1.6
|
75
|
+
version: '1.6'
|
86
76
|
type: :runtime
|
87
77
|
prerelease: false
|
88
78
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
79
|
requirements:
|
91
|
-
- - ~>
|
80
|
+
- - "~>"
|
92
81
|
- !ruby/object:Gem::Version
|
93
|
-
version: 1.6
|
82
|
+
version: '1.6'
|
94
83
|
- !ruby/object:Gem::Dependency
|
95
84
|
name: sqlite3
|
96
85
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
86
|
requirements:
|
99
|
-
- - ~>
|
87
|
+
- - "~>"
|
100
88
|
- !ruby/object:Gem::Version
|
101
|
-
version: 1.3
|
89
|
+
version: '1.3'
|
102
90
|
type: :runtime
|
103
91
|
prerelease: false
|
104
92
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
93
|
requirements:
|
107
|
-
- - ~>
|
94
|
+
- - "~>"
|
108
95
|
- !ruby/object:Gem::Version
|
109
|
-
version: 1.3
|
96
|
+
version: '1.3'
|
110
97
|
- !ruby/object:Gem::Dependency
|
111
98
|
name: thor
|
112
99
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
100
|
requirements:
|
115
|
-
- - ~>
|
101
|
+
- - "~>"
|
116
102
|
- !ruby/object:Gem::Version
|
117
|
-
version: 0.
|
103
|
+
version: '0.19'
|
118
104
|
type: :runtime
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
107
|
requirements:
|
123
|
-
- - ~>
|
108
|
+
- - "~>"
|
124
109
|
- !ruby/object:Gem::Version
|
125
|
-
version: 0.
|
110
|
+
version: '0.19'
|
126
111
|
- !ruby/object:Gem::Dependency
|
127
112
|
name: rake
|
128
113
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
114
|
requirements:
|
131
|
-
- -
|
115
|
+
- - ">="
|
132
116
|
- !ruby/object:Gem::Version
|
133
117
|
version: '0'
|
134
118
|
type: :development
|
135
119
|
prerelease: false
|
136
120
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
121
|
requirements:
|
139
|
-
- -
|
122
|
+
- - ">="
|
140
123
|
- !ruby/object:Gem::Version
|
141
124
|
version: '0'
|
142
125
|
- !ruby/object:Gem::Dependency
|
143
126
|
name: bundler
|
144
127
|
requirement: !ruby/object:Gem::Requirement
|
145
|
-
none: false
|
146
128
|
requirements:
|
147
|
-
- - ~>
|
129
|
+
- - "~>"
|
148
130
|
- !ruby/object:Gem::Version
|
149
131
|
version: '1.3'
|
150
132
|
type: :development
|
151
133
|
prerelease: false
|
152
134
|
version_requirements: !ruby/object:Gem::Requirement
|
153
|
-
none: false
|
154
135
|
requirements:
|
155
|
-
- - ~>
|
136
|
+
- - "~>"
|
156
137
|
- !ruby/object:Gem::Version
|
157
138
|
version: '1.3'
|
158
139
|
- !ruby/object:Gem::Dependency
|
159
140
|
name: rspec
|
160
141
|
requirement: !ruby/object:Gem::Requirement
|
161
|
-
none: false
|
162
|
-
requirements:
|
163
|
-
- - ~>
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: 2.14.1
|
166
|
-
type: :development
|
167
|
-
prerelease: false
|
168
|
-
version_requirements: !ruby/object:Gem::Requirement
|
169
|
-
none: false
|
170
|
-
requirements:
|
171
|
-
- - ~>
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: 2.14.1
|
174
|
-
- !ruby/object:Gem::Dependency
|
175
|
-
name: mocha
|
176
|
-
requirement: !ruby/object:Gem::Requirement
|
177
|
-
none: false
|
178
142
|
requirements:
|
179
|
-
- - ~>
|
143
|
+
- - "~>"
|
180
144
|
- !ruby/object:Gem::Version
|
181
|
-
version: 1
|
145
|
+
version: '3.1'
|
182
146
|
type: :development
|
183
147
|
prerelease: false
|
184
148
|
version_requirements: !ruby/object:Gem::Requirement
|
185
|
-
none: false
|
186
149
|
requirements:
|
187
|
-
- - ~>
|
150
|
+
- - "~>"
|
188
151
|
- !ruby/object:Gem::Version
|
189
|
-
version: 1
|
152
|
+
version: '3.1'
|
190
153
|
- !ruby/object:Gem::Dependency
|
191
154
|
name: database_cleaner
|
192
155
|
requirement: !ruby/object:Gem::Requirement
|
193
|
-
none: false
|
194
156
|
requirements:
|
195
|
-
- - ~>
|
157
|
+
- - "~>"
|
196
158
|
- !ruby/object:Gem::Version
|
197
|
-
version: 1.
|
159
|
+
version: 1.3.0
|
198
160
|
type: :development
|
199
161
|
prerelease: false
|
200
162
|
version_requirements: !ruby/object:Gem::Requirement
|
201
|
-
none: false
|
202
163
|
requirements:
|
203
|
-
- - ~>
|
164
|
+
- - "~>"
|
204
165
|
- !ruby/object:Gem::Version
|
205
|
-
version: 1.
|
166
|
+
version: 1.3.0
|
206
167
|
- !ruby/object:Gem::Dependency
|
207
168
|
name: shoulda-matchers
|
208
169
|
requirement: !ruby/object:Gem::Requirement
|
209
|
-
none: false
|
210
170
|
requirements:
|
211
|
-
- - ~>
|
171
|
+
- - "~>"
|
212
172
|
- !ruby/object:Gem::Version
|
213
|
-
version: 2.
|
173
|
+
version: 2.7.0
|
214
174
|
type: :development
|
215
175
|
prerelease: false
|
216
176
|
version_requirements: !ruby/object:Gem::Requirement
|
217
|
-
none: false
|
218
177
|
requirements:
|
219
|
-
- - ~>
|
178
|
+
- - "~>"
|
220
179
|
- !ruby/object:Gem::Version
|
221
|
-
version: 2.
|
180
|
+
version: 2.7.0
|
222
181
|
description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
|
223
182
|
Common Crawl data.
|
224
183
|
email:
|
@@ -228,8 +187,8 @@ executables:
|
|
228
187
|
extensions: []
|
229
188
|
extra_rdoc_files: []
|
230
189
|
files:
|
231
|
-
- .gitignore
|
232
|
-
- .travis.yml
|
190
|
+
- ".gitignore"
|
191
|
+
- ".travis.yml"
|
233
192
|
- Cheffile
|
234
193
|
- Cheffile.lock
|
235
194
|
- Gemfile
|
@@ -242,6 +201,7 @@ files:
|
|
242
201
|
- db/migrate/201401051855_create_crawl_segments.rb
|
243
202
|
- db/migrate/201401101723_create_jobs.rb
|
244
203
|
- db/migrate/201401141606_create_job_steps.rb
|
204
|
+
- db/migrate/201412311554_add_file_count_to_crawl_segments.rb
|
245
205
|
- elasticrawl.gemspec
|
246
206
|
- lib/elasticrawl.rb
|
247
207
|
- lib/elasticrawl/cluster.rb
|
@@ -257,6 +217,7 @@ files:
|
|
257
217
|
- spec/fixtures/aws.yml
|
258
218
|
- spec/fixtures/cluster.yml
|
259
219
|
- spec/fixtures/jobs.yml
|
220
|
+
- spec/fixtures/warc.paths
|
260
221
|
- spec/spec_helper.rb
|
261
222
|
- spec/unit/cluster_spec.rb
|
262
223
|
- spec/unit/combine_job_spec.rb
|
@@ -272,38 +233,32 @@ files:
|
|
272
233
|
homepage: https://github.com/rossf7/elasticrawl
|
273
234
|
licenses:
|
274
235
|
- MIT
|
236
|
+
metadata: {}
|
275
237
|
post_install_message:
|
276
238
|
rdoc_options: []
|
277
239
|
require_paths:
|
278
240
|
- lib
|
279
241
|
required_ruby_version: !ruby/object:Gem::Requirement
|
280
|
-
none: false
|
281
242
|
requirements:
|
282
|
-
- -
|
243
|
+
- - ">="
|
283
244
|
- !ruby/object:Gem::Version
|
284
245
|
version: '0'
|
285
|
-
segments:
|
286
|
-
- 0
|
287
|
-
hash: -3344138865650739079
|
288
246
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
289
|
-
none: false
|
290
247
|
requirements:
|
291
|
-
- -
|
248
|
+
- - ">="
|
292
249
|
- !ruby/object:Gem::Version
|
293
250
|
version: '0'
|
294
|
-
segments:
|
295
|
-
- 0
|
296
|
-
hash: -3344138865650739079
|
297
251
|
requirements: []
|
298
252
|
rubyforge_project:
|
299
|
-
rubygems_version:
|
253
|
+
rubygems_version: 2.2.2
|
300
254
|
signing_key:
|
301
|
-
specification_version:
|
255
|
+
specification_version: 4
|
302
256
|
summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
|
303
257
|
test_files:
|
304
258
|
- spec/fixtures/aws.yml
|
305
259
|
- spec/fixtures/cluster.yml
|
306
260
|
- spec/fixtures/jobs.yml
|
261
|
+
- spec/fixtures/warc.paths
|
307
262
|
- spec/spec_helper.rb
|
308
263
|
- spec/unit/cluster_spec.rb
|
309
264
|
- spec/unit/combine_job_spec.rb
|