common-crawl-index 0.0.1 → 0.0.2.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/README.md +2 -28
- data/lib/common-crawl-index.rb +4 -5
- data/lib/common-crawl-index/version.rb +1 -1
- data/spec/basic_spec.rb +1 -7
- metadata +5 -5
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# CommonCrawlIndex
|
2
2
|
|
3
|
-
|
3
|
+
TODO: Write a gem description
|
4
4
|
|
5
5
|
## Installation
|
6
6
|
|
@@ -18,33 +18,7 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
## Usage
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
```ruby
|
24
|
-
CommonCrawlIndex::Client.config({
|
25
|
-
:access_key_id => "amazon aws access_key",
|
26
|
-
:secret_access_key => "amazon aws secret_key",
|
27
|
-
:cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
|
28
|
-
})
|
29
|
-
```
|
30
|
-
|
31
|
-
And to find URLs matching certain prefix use following syntax
|
32
|
-
|
33
|
-
```ruby
|
34
|
-
client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
|
35
|
-
|
36
|
-
# or
|
37
|
-
|
38
|
-
client = CommonCrawlIndex::Client.new() # already configured
|
39
|
-
|
40
|
-
url = "http://www.amazon.com/"
|
41
|
-
|
42
|
-
client.find_by_prefix(url) do |url_data|
|
43
|
-
# get all URLs starting with http://www.amazon.com/
|
44
|
-
end
|
45
|
-
```
|
46
|
-
|
47
|
-
See `spec/basic_spec.rb` for more examples on usage.
|
21
|
+
TODO: Write usage instructions here
|
48
22
|
|
49
23
|
## Contributing
|
50
24
|
|
data/lib/common-crawl-index.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
1
|
+
require "common-crawl-index/version"
|
2
2
|
require 'aws-sdk'
|
3
3
|
require 'open3'
|
4
|
-
|
4
|
+
|
5
5
|
|
6
6
|
module CommonCrawlIndex
|
7
7
|
class Client
|
@@ -42,7 +42,7 @@ module CommonCrawlIndex
|
|
42
42
|
|
43
43
|
def self.normalize_url(url, append_scheme = true)
|
44
44
|
url_to_find = url
|
45
|
-
norm_url_to_find =
|
45
|
+
norm_url_to_find = URI(url_to_find)
|
46
46
|
norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
|
47
47
|
norm_url = norm_url_to_find.to_s
|
48
48
|
norm_url = norm_url[norm_url.index("\/\/")+2..-1]
|
@@ -58,7 +58,7 @@ module CommonCrawlIndex
|
|
58
58
|
scheme = normalized_url[colon_index+1..-1] if colon_index
|
59
59
|
end
|
60
60
|
url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
|
61
|
-
uri =
|
61
|
+
uri = URI(url_with_scheme)
|
62
62
|
uri.host = uri.host.split(".").reverse.join(".")
|
63
63
|
uri.to_s
|
64
64
|
end
|
@@ -119,7 +119,6 @@ module CommonCrawlIndex
|
|
119
119
|
end
|
120
120
|
cur_loc = nil_loc + 32 + 1
|
121
121
|
end
|
122
|
-
true
|
123
122
|
end
|
124
123
|
|
125
124
|
def read(target_range)
|
data/spec/basic_spec.rb
CHANGED
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
|
|
26
26
|
it "should find by prefix" do
|
27
27
|
client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
|
28
28
|
|
29
|
-
total_urls_to_test =
|
29
|
+
total_urls_to_test = 100
|
30
30
|
|
31
31
|
url = "http://www.amazon.com/"
|
32
32
|
normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
|
@@ -52,9 +52,6 @@ describe CommonCrawlIndex do
|
|
52
52
|
it "should normalize the urls correctly" do
|
53
53
|
normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
|
54
54
|
normalized_url.should == "com.google.www/test/path:http"
|
55
|
-
|
56
|
-
normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
|
57
|
-
normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
|
58
55
|
end
|
59
56
|
|
60
57
|
it "should normalize the urls correctly without scheme" do
|
@@ -65,9 +62,6 @@ describe CommonCrawlIndex do
|
|
65
62
|
it "should denormalize the urls correctly" do
|
66
63
|
url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
|
67
64
|
url.should == "http://www.google.com/test/path"
|
68
|
-
|
69
|
-
url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
|
70
|
-
url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
|
71
65
|
end
|
72
66
|
|
73
67
|
it "should denormalize the urls correctly without scheme" do
|
metadata
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: common-crawl-index
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.2.alpha
|
5
|
+
prerelease: 6
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Amit Ambardekar
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-01-
|
12
|
+
date: 2013-01-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -61,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
61
61
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
62
|
none: false
|
63
63
|
requirements:
|
64
|
-
- - ! '
|
64
|
+
- - ! '>'
|
65
65
|
- !ruby/object:Gem::Version
|
66
|
-
version:
|
66
|
+
version: 1.3.1
|
67
67
|
requirements: []
|
68
68
|
rubyforge_project:
|
69
69
|
rubygems_version: 1.8.24
|