elasticrawl 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +3 -4
- data/CHANGELOG.md +3 -0
- data/Gemfile +1 -1
- data/README.md +11 -21
- data/elasticrawl.gemspec +1 -1
- data/lib/elasticrawl.rb +2 -2
- data/lib/elasticrawl/version.rb +1 -1
- data/spec/unit/crawl_segment_spec.rb +1 -1
- data/spec/unit/crawl_spec.rb +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d1d5521ae9a6e0762c0b057bbd828f6c15be0c4
|
4
|
+
data.tar.gz: df0386cf340ac6aff20bc95bffcfa0f0fb3995ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58f9f46e73d3bf03da4bfad2260cca0663430bd50187ed4927f564d15ac2b2acd377a76dd9016978eae8eb80caee3a76d610d0b524b010bbaa5a7cd953fdbbc9
|
7
|
+
data.tar.gz: e415324ccadc507ac37ddeaca0ae0ca71e9b6b93b008d23b2edefb985bd98928afe65ae508ad9d7ad68d532e936259bc36c6fdebce3806db2b6d39215d4dbf6e
|
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -13,19 +13,11 @@ This [blog post](https://rossfairbanks.com/2015/01/03/parsing-common-crawl-using
|
|
13
13
|
|
14
14
|
## Installation
|
15
15
|
|
16
|
-
|
16
|
+
* Elasticrawl needs a [Ruby installation](https://www.ruby-lang.org/en/documentation/installation/) (2.1 or higher).
|
17
|
+
* Install Elasticrawl from RubyGems.
|
17
18
|
|
18
|
-
```
|
19
|
-
|
20
|
-
# Linux (64-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-linux-x86_64.tar.gz
|
21
|
-
# Linux (32-bit) https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-linux-x86.tar.gz
|
22
|
-
|
23
|
-
# e.g.
|
24
|
-
|
25
|
-
curl -O https://d2ujrnticqzebc.cloudfront.net/elasticrawl-1.1.5-osx.tar.gz
|
26
|
-
tar -xzf elasticrawl-1.1.5-osx.tar.gz
|
27
|
-
cd elasticrawl-1.1.5-osx/
|
28
|
-
./elasticrawl --help
|
19
|
+
```
|
20
|
+
gem install elasticrawl --no-rdoc --no-ri
|
29
21
|
```
|
30
22
|
|
31
23
|
### Troubleshooting
|
@@ -45,7 +37,7 @@ The init command takes in an S3 bucket name and your AWS credentials. The S3 buc
|
|
45
37
|
and will store your data and logs.
|
46
38
|
|
47
39
|
```bash
|
48
|
-
~$
|
40
|
+
~$ elasticrawl init your-s3-bucket
|
49
41
|
|
50
42
|
Enter AWS Access Key ID: ************
|
51
43
|
Enter AWS Secret Access Key: ************
|
@@ -62,7 +54,7 @@ Config complete
|
|
62
54
|
The parse command takes in the crawl name and an optional number of segments and files to parse.
|
63
55
|
|
64
56
|
```bash
|
65
|
-
~$
|
57
|
+
~$ elasticrawl parse CC-MAIN-2015-48 --max-segments 2 --max-files 3
|
66
58
|
Segments
|
67
59
|
Segment: 1416400372202.67 Files: 150
|
68
60
|
Segment: 1416400372490.23 Files: 124
|
@@ -85,7 +77,7 @@ Job: 1420124830792 Job Flow ID: j-2R3MFE6TWLIUB
|
|
85
77
|
The combine command takes in the results of previous parse jobs and produces a combined set of results.
|
86
78
|
|
87
79
|
```bash
|
88
|
-
~$
|
80
|
+
~$ elasticrawl combine --input-jobs 1420124830792
|
89
81
|
Job configuration
|
90
82
|
Combining: 2 segments
|
91
83
|
|
@@ -104,7 +96,7 @@ Job: 1420129496115 Job Flow ID: j-251GXDIZGK8HL
|
|
104
96
|
The status command shows crawls and your job history.
|
105
97
|
|
106
98
|
```bash
|
107
|
-
~$
|
99
|
+
~$ elasticrawl status
|
108
100
|
Crawl Status
|
109
101
|
CC-MAIN-2015-48 Segments: to parse 98, parsed 2, total 100
|
110
102
|
|
@@ -117,7 +109,7 @@ Job History (last 10)
|
|
117
109
|
The reset command resets a crawl so it is parsed again.
|
118
110
|
|
119
111
|
```bash
|
120
|
-
~$
|
112
|
+
~$ elasticrawl reset CC-MAIN-2015-48
|
121
113
|
Reset crawl? (y/n)
|
122
114
|
y
|
123
115
|
CC-MAIN-2015-48 Segments: to parse 100, parsed 0, total 100
|
@@ -128,7 +120,7 @@ y
|
|
128
120
|
The destroy command deletes your S3 bucket and the ~/.elasticrawl directory.
|
129
121
|
|
130
122
|
```bash
|
131
|
-
~$
|
123
|
+
~$ elasticrawl destroy
|
132
124
|
|
133
125
|
WARNING:
|
134
126
|
Bucket s3://elasticrawl-test and its data will be deleted
|
@@ -158,14 +150,12 @@ stores your S3 bucket name and the config for the parse and combine jobs
|
|
158
150
|
|
159
151
|
## Development
|
160
152
|
|
161
|
-
Elasticrawl is developed in Ruby and requires Ruby 2.
|
153
|
+
Elasticrawl is developed in Ruby and requires Ruby 2.1.0 or later (Ruby 2.3 is recommended). The sqlite3 and nokogiri gems have C extensions, which means you may need to install development headers.
|
162
154
|
|
163
155
|
[](http://badge.fury.io/rb/elasticrawl)
|
164
156
|
[](https://codeclimate.com/github/rossf7/elasticrawl)
|
165
157
|
[](https://travis-ci.org/rossf7/elasticrawl) 2.0.0, 2.1.8, 2.2.4, 2.3.0
|
166
158
|
|
167
|
-
The deployment packages are created using [Traveling Ruby](http://phusion.github.io/traveling-ruby/). The deploy packages contain a Ruby 2.2 interpreter, Gems and the compiled C extensions. The [traveling-elasticrawl](https://github.com/rossf7/traveling-elasticrawl) repository has a Rake task that automates building the deployment packages.
|
168
|
-
|
169
159
|
## TODO
|
170
160
|
|
171
161
|
* Add support for Streaming and Pig jobs
|
data/elasticrawl.gemspec
CHANGED
@@ -27,7 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_dependency 'thor', '~> 0.19.1'
|
28
28
|
|
29
29
|
spec.add_development_dependency 'rake', '~> 10.4.2'
|
30
|
-
spec.add_development_dependency 'bundler', '~> 1.
|
30
|
+
spec.add_development_dependency 'bundler', '~> 1.12.5'
|
31
31
|
spec.add_development_dependency 'rspec', '~> 3.4.0'
|
32
32
|
spec.add_development_dependency 'database_cleaner', '~> 1.5.1'
|
33
33
|
spec.add_development_dependency 'shoulda-matchers', '~> 3.0.1'
|
data/lib/elasticrawl.rb
CHANGED
@@ -7,8 +7,8 @@ require 'thor'
|
|
7
7
|
|
8
8
|
module Elasticrawl
|
9
9
|
# S3 locations
|
10
|
-
COMMON_CRAWL_BUCKET = '
|
11
|
-
COMMON_CRAWL_PATH = '
|
10
|
+
COMMON_CRAWL_BUCKET = 'commoncrawl'
|
11
|
+
COMMON_CRAWL_PATH = 'crawl-data'
|
12
12
|
SEGMENTS_PATH = 'segments'
|
13
13
|
WARC_PATHS = 'warc.paths.gz'
|
14
14
|
MAX_SEGMENTS = 256
|
data/lib/elasticrawl/version.rb
CHANGED
data/spec/unit/crawl_segment_spec.rb
CHANGED
@@ -22,7 +22,7 @@ describe Elasticrawl::CrawlSegment, type: :model do
|
|
22
22
|
|
23
23
|
it 'should have an s3 uri' do
|
24
24
|
expect(subject.segment_s3_uri).to eq \
|
25
|
-
"s3://
|
25
|
+
"s3://commoncrawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
|
26
26
|
end
|
27
27
|
|
28
28
|
it 'should have a file count' do
|
data/spec/unit/crawl_spec.rb
CHANGED
@@ -35,7 +35,7 @@ describe Elasticrawl::Crawl, type: :model do
|
|
35
35
|
|
36
36
|
it 'should create segment s3 uris' do
|
37
37
|
expect(subject.crawl_segments[0].segment_s3_uri).to eq \
|
38
|
-
's3://
|
38
|
+
's3://commoncrawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
|
39
39
|
end
|
40
40
|
|
41
41
|
it 'should set file counts' do
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: elasticrawl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ross Fairbanks
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-06-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activerecord
|
@@ -128,14 +128,14 @@ dependencies:
|
|
128
128
|
requirements:
|
129
129
|
- - "~>"
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: 1.
|
131
|
+
version: 1.12.5
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: 1.
|
138
|
+
version: 1.12.5
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: rspec
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -251,7 +251,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
251
251
|
version: '0'
|
252
252
|
requirements: []
|
253
253
|
rubyforge_project:
|
254
|
-
rubygems_version: 2.
|
254
|
+
rubygems_version: 2.5.1
|
255
255
|
signing_key:
|
256
256
|
specification_version: 4
|
257
257
|
summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
|