common-crawl-index 0.0.1.alpha → 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
+ ## v0.0.1.alpha
2
+
3
+ * initial release
data/Gemfile CHANGED
@@ -4,4 +4,4 @@ source 'https://rubygems.org'
4
4
  gemspec
5
5
 
6
6
  gem 'aws-sdk'
7
- #gem 'open3'
7
+ gem 'addressable'
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CommonCrawlIndex
2
2
 
3
- TODO: Write a gem description
3
+ Use this gem to access the [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with Ruby. You can find more information on the index format and the original Python implementation at https://github.com/trivio/common_crawl_index
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,33 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ When using the gem with Rails, configure the client in `config/initializers/common_crawl_index.rb`:
22
+
23
+ ```ruby
24
+ CommonCrawlIndex::Client.config({
25
+ :access_key_id => "amazon aws access_key",
26
+ :secret_access_key => "amazon aws secret_key",
27
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
28
+ })
29
+ ```
30
+
31
+ To find URLs matching a certain prefix, use the following syntax:
32
+
33
+ ```ruby
34
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
35
+
36
+ # or
37
+
38
+ client = CommonCrawlIndex::Client.new() # already configured
39
+
40
+ url = "http://www.amazon.com/"
41
+
42
+ client.find_by_prefix(url) do |url_data|
43
+ # get all URLs starting with http://www.amazon.com/
44
+ end
45
+ ```
46
+
47
+ See `spec/basic_spec.rb` for more usage examples.
22
48
 
23
49
  ## Contributing
24
50
 
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
10
10
  gem.email = ["amitamb@gmail.com"]
11
11
  gem.description = %q{Access coomon crawl URL index}
12
12
  gem.summary = %q{Access coomon crawl URL index}
13
- gem.homepage = ""
13
+ gem.homepage = "https://github.com/VerticalSet/common-crawl-index"
14
14
 
15
15
  gem.add_development_dependency "rspec"
16
16
 
@@ -1,7 +1,7 @@
1
- require "common-crawl-index/version"
1
+ require 'common-crawl-index/version'
2
2
  require 'aws-sdk'
3
3
  require 'open3'
4
-
4
+ require 'addressable/uri'
5
5
 
6
6
  module CommonCrawlIndex
7
7
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
42
42
 
43
43
  def self.normalize_url(url, append_scheme = true)
44
44
  url_to_find = url
45
- norm_url_to_find = URI(url_to_find)
45
+ norm_url_to_find = Addressable::URI.parse(url_to_find)
46
46
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
47
  norm_url = norm_url_to_find.to_s
48
48
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
58
58
  scheme = normalized_url[colon_index+1..-1] if colon_index
59
59
  end
60
60
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
- uri = URI(url_with_scheme)
61
+ uri = Addressable::URI.parse(url_with_scheme)
62
62
  uri.host = uri.host.split(".").reverse.join(".")
63
63
  uri.to_s
64
64
  end
@@ -119,6 +119,7 @@ module CommonCrawlIndex
119
119
  end
120
120
  cur_loc = nil_loc + 32 + 1
121
121
  end
122
+ true
122
123
  end
123
124
 
124
125
  def read(target_range)
@@ -1,3 +1,3 @@
1
1
  module CommonCrawlIndex
2
- VERSION = "0.0.1.alpha"
2
+ VERSION = "0.0.1"
3
3
  end
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
26
26
  it "should find by prefix" do
27
27
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
28
 
29
- total_urls_to_test = 100
29
+ total_urls_to_test = 2500
30
30
 
31
31
  url = "http://www.amazon.com/"
32
32
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
52
52
  it "should normalize the urls correctly" do
53
53
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
54
  normalized_url.should == "com.google.www/test/path:http"
55
+
56
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
57
+ normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
55
58
  end
56
59
 
57
60
  it "should normalize the urls correctly without scheme" do
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
62
65
  it "should denormalize the urls correctly" do
63
66
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
64
67
  url.should == "http://www.google.com/test/path"
68
+
69
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
70
+ url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
65
71
  end
66
72
 
67
73
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: common-crawl-index
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1.alpha
5
- prerelease: 6
4
+ version: 0.0.1
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Amit Ambardekar
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-01-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -36,6 +36,7 @@ extra_rdoc_files: []
36
36
  files:
37
37
  - .gitignore
38
38
  - .rspec
39
+ - CHANGELOG.md
39
40
  - Gemfile
40
41
  - LICENSE.txt
41
42
  - README.md
@@ -45,7 +46,7 @@ files:
45
46
  - lib/common-crawl-index/version.rb
46
47
  - spec/basic_spec.rb
47
48
  - spec/spec_helper.rb
48
- homepage: ''
49
+ homepage: https://github.com/VerticalSet/common-crawl-index
49
50
  licenses: []
50
51
  post_install_message:
51
52
  rdoc_options: []
@@ -60,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
60
61
  required_rubygems_version: !ruby/object:Gem::Requirement
61
62
  none: false
62
63
  requirements:
63
- - - ! '>'
64
+ - - ! '>='
64
65
  - !ruby/object:Gem::Version
65
- version: 1.3.1
66
+ version: '0'
66
67
  requirements: []
67
68
  rubyforge_project:
68
69
  rubygems_version: 1.8.24