common-crawl-index 0.0.1.alpha → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ ## v0.0.1.alpha
2
+
3
+ * initial release
data/Gemfile CHANGED
@@ -4,4 +4,4 @@ source 'https://rubygems.org'
4
4
  gemspec
5
5
 
6
6
  gem 'aws-sdk'
7
- #gem 'open3'
7
+ gem 'addressable'
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CommonCrawlIndex
2
2
 
3
- TODO: Write a gem description
3
+ Use this gem to access the [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) from Ruby. More information on the index format and the original Python implementation is available at https://github.com/trivio/common_crawl_index.
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,33 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ When using the gem with Rails, configure it in `config/initializers/common_crawl_index.rb`:
22
+
23
+ ```ruby
24
+ CommonCrawlIndex::Client.config({
25
+ :access_key_id => "amazon aws access_key",
26
+ :secret_access_key => "amazon aws secret_key",
27
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
28
+ })
29
+ ```
30
+
31
+ To find URLs matching a certain prefix, use the following syntax:
32
+
33
+ ```ruby
34
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
35
+
36
+ # or
37
+
38
+ client = CommonCrawlIndex::Client.new() # already configured
39
+
40
+ url = "http://www.amazon.com/"
41
+
42
+ client.find_by_prefix(url) do |url_data|
43
+ # get all URLs starting with http://www.amazon.com/
44
+ end
45
+ ```
46
+
47
+ See `spec/basic_spec.rb` for more usage examples.
22
48
 
23
49
  ## Contributing
24
50
 
@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
10
10
  gem.email = ["amitamb@gmail.com"]
11
11
  gem.description = %q{Access coomon crawl URL index}
12
12
  gem.summary = %q{Access coomon crawl URL index}
13
- gem.homepage = ""
13
+ gem.homepage = "https://github.com/VerticalSet/common-crawl-index"
14
14
 
15
15
  gem.add_development_dependency "rspec"
16
16
 
@@ -1,7 +1,7 @@
1
- require "common-crawl-index/version"
1
+ require 'common-crawl-index/version'
2
2
  require 'aws-sdk'
3
3
  require 'open3'
4
-
4
+ require 'addressable/uri'
5
5
 
6
6
  module CommonCrawlIndex
7
7
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
42
42
 
43
43
  def self.normalize_url(url, append_scheme = true)
44
44
  url_to_find = url
45
- norm_url_to_find = URI(url_to_find)
45
+ norm_url_to_find = Addressable::URI.parse(url_to_find)
46
46
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
47
  norm_url = norm_url_to_find.to_s
48
48
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
58
58
  scheme = normalized_url[colon_index+1..-1] if colon_index
59
59
  end
60
60
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
- uri = URI(url_with_scheme)
61
+ uri = Addressable::URI.parse(url_with_scheme)
62
62
  uri.host = uri.host.split(".").reverse.join(".")
63
63
  uri.to_s
64
64
  end
@@ -119,6 +119,7 @@ module CommonCrawlIndex
119
119
  end
120
120
  cur_loc = nil_loc + 32 + 1
121
121
  end
122
+ true
122
123
  end
123
124
 
124
125
  def read(target_range)
@@ -1,3 +1,3 @@
1
1
  module CommonCrawlIndex
2
- VERSION = "0.0.1.alpha"
2
+ VERSION = "0.0.1"
3
3
  end
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
26
26
  it "should find by prefix" do
27
27
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
28
 
29
- total_urls_to_test = 100
29
+ total_urls_to_test = 2500
30
30
 
31
31
  url = "http://www.amazon.com/"
32
32
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
52
52
  it "should normalize the urls correctly" do
53
53
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
54
  normalized_url.should == "com.google.www/test/path:http"
55
+
56
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
57
+ normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
55
58
  end
56
59
 
57
60
  it "should normalize the urls correctly without scheme" do
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
62
65
  it "should denormalize the urls correctly" do
63
66
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
64
67
  url.should == "http://www.google.com/test/path"
68
+
69
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
70
+ url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
65
71
  end
66
72
 
67
73
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: common-crawl-index
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1.alpha
5
- prerelease: 6
4
+ version: 0.0.1
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Amit Ambardekar
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-01-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -36,6 +36,7 @@ extra_rdoc_files: []
36
36
  files:
37
37
  - .gitignore
38
38
  - .rspec
39
+ - CHANGELOG.md
39
40
  - Gemfile
40
41
  - LICENSE.txt
41
42
  - README.md
@@ -45,7 +46,7 @@ files:
45
46
  - lib/common-crawl-index/version.rb
46
47
  - spec/basic_spec.rb
47
48
  - spec/spec_helper.rb
48
- homepage: ''
49
+ homepage: https://github.com/VerticalSet/common-crawl-index
49
50
  licenses: []
50
51
  post_install_message:
51
52
  rdoc_options: []
@@ -60,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
60
61
  required_rubygems_version: !ruby/object:Gem::Requirement
61
62
  none: false
62
63
  requirements:
63
- - - ! '>'
64
+ - - ! '>='
64
65
  - !ruby/object:Gem::Version
65
- version: 1.3.1
66
+ version: '0'
66
67
  requirements: []
67
68
  rubyforge_project:
68
69
  rubygems_version: 1.8.24