common-crawl-index 0.0.1.alpha → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +3 -0
- data/Gemfile +1 -1
- data/README.md +28 -2
- data/common-crawl-index.gemspec +1 -1
- data/lib/common-crawl-index.rb +5 -4
- data/lib/common-crawl-index/version.rb +1 -1
- data/spec/basic_spec.rb +7 -1
- metadata +7 -6
data/CHANGELOG.md
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# CommonCrawlIndex
|
2
2
|
|
3
|
-
|
3
|
+
Use this gem to access the [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with Ruby. You can find more information on the index format and the original Python implementation at https://github.com/trivio/common_crawl_index
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,7 +18,33 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
21
|
+
When using with Rails, add the following to `config/initializers/common_crawl_index.rb`:
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
CommonCrawlIndex::Client.config({
|
25
|
+
:access_key_id => "amazon aws access_key",
|
26
|
+
:secret_access_key => "amazon aws secret_key",
|
27
|
+
:cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
|
28
|
+
})
|
29
|
+
```
|
30
|
+
|
31
|
+
To find URLs matching a certain prefix, use the following syntax:
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
|
35
|
+
|
36
|
+
# or
|
37
|
+
|
38
|
+
client = CommonCrawlIndex::Client.new() # already configured
|
39
|
+
|
40
|
+
url = "http://www.amazon.com/"
|
41
|
+
|
42
|
+
client.find_by_prefix(url) do |url_data|
|
43
|
+
# get all URLs starting with http://www.amazon.com/
|
44
|
+
end
|
45
|
+
```
|
46
|
+
|
47
|
+
See `spec/basic_spec.rb` for more examples on usage.
|
22
48
|
|
23
49
|
## Contributing
|
24
50
|
|
data/common-crawl-index.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
|
|
10
10
|
gem.email = ["amitamb@gmail.com"]
|
11
11
|
gem.description = %q{Access coomon crawl URL index}
|
12
12
|
gem.summary = %q{Access coomon crawl URL index}
|
13
|
-
gem.homepage = ""
|
13
|
+
gem.homepage = "https://github.com/VerticalSet/common-crawl-index"
|
14
14
|
|
15
15
|
gem.add_development_dependency "rspec"
|
16
16
|
|
data/lib/common-crawl-index.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
1
|
+
require 'common-crawl-index/version'
|
2
2
|
require 'aws-sdk'
|
3
3
|
require 'open3'
|
4
|
-
|
4
|
+
require 'addressable/uri'
|
5
5
|
|
6
6
|
module CommonCrawlIndex
|
7
7
|
class Client
|
@@ -42,7 +42,7 @@ module CommonCrawlIndex
|
|
42
42
|
|
43
43
|
def self.normalize_url(url, append_scheme = true)
|
44
44
|
url_to_find = url
|
45
|
-
norm_url_to_find = URI(url_to_find)
|
45
|
+
norm_url_to_find = Addressable::URI.parse(url_to_find)
|
46
46
|
norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
|
47
47
|
norm_url = norm_url_to_find.to_s
|
48
48
|
norm_url = norm_url[norm_url.index("\/\/")+2..-1]
|
@@ -58,7 +58,7 @@ module CommonCrawlIndex
|
|
58
58
|
scheme = normalized_url[colon_index+1..-1] if colon_index
|
59
59
|
end
|
60
60
|
url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
|
61
|
-
uri = URI(url_with_scheme)
|
61
|
+
uri = Addressable::URI.parse(url_with_scheme)
|
62
62
|
uri.host = uri.host.split(".").reverse.join(".")
|
63
63
|
uri.to_s
|
64
64
|
end
|
@@ -119,6 +119,7 @@ module CommonCrawlIndex
|
|
119
119
|
end
|
120
120
|
cur_loc = nil_loc + 32 + 1
|
121
121
|
end
|
122
|
+
true
|
122
123
|
end
|
123
124
|
|
124
125
|
def read(target_range)
|
data/spec/basic_spec.rb
CHANGED
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
|
|
26
26
|
it "should find by prefix" do
|
27
27
|
client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
|
28
28
|
|
29
|
-
total_urls_to_test =
|
29
|
+
total_urls_to_test = 2500
|
30
30
|
|
31
31
|
url = "http://www.amazon.com/"
|
32
32
|
normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
|
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
|
|
52
52
|
it "should normalize the urls correctly" do
|
53
53
|
normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
|
54
54
|
normalized_url.should == "com.google.www/test/path:http"
|
55
|
+
|
56
|
+
normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
|
57
|
+
normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
|
55
58
|
end
|
56
59
|
|
57
60
|
it "should normalize the urls correctly without scheme" do
|
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
|
|
62
65
|
it "should denormalize the urls correctly" do
|
63
66
|
url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
|
64
67
|
url.should == "http://www.google.com/test/path"
|
68
|
+
|
69
|
+
url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
|
70
|
+
url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
|
65
71
|
end
|
66
72
|
|
67
73
|
it "should denormalize the urls correctly without scheme" do
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: common-crawl-index
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Amit Ambardekar
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -36,6 +36,7 @@ extra_rdoc_files: []
|
|
36
36
|
files:
|
37
37
|
- .gitignore
|
38
38
|
- .rspec
|
39
|
+
- CHANGELOG.md
|
39
40
|
- Gemfile
|
40
41
|
- LICENSE.txt
|
41
42
|
- README.md
|
@@ -45,7 +46,7 @@ files:
|
|
45
46
|
- lib/common-crawl-index/version.rb
|
46
47
|
- spec/basic_spec.rb
|
47
48
|
- spec/spec_helper.rb
|
48
|
-
homepage:
|
49
|
+
homepage: https://github.com/VerticalSet/common-crawl-index
|
49
50
|
licenses: []
|
50
51
|
post_install_message:
|
51
52
|
rdoc_options: []
|
@@ -60,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
60
61
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
62
|
none: false
|
62
63
|
requirements:
|
63
|
-
- - ! '
|
64
|
+
- - ! '>='
|
64
65
|
- !ruby/object:Gem::Version
|
65
|
-
version:
|
66
|
+
version: '0'
|
66
67
|
requirements: []
|
67
68
|
rubyforge_project:
|
68
69
|
rubygems_version: 1.8.24
|