RubyGems - common-crawl-index - Versions diffs - 0.0.1.alpha → 0.0.1 - Mend

common-crawl-index 0.0.1.alpha → 0.0.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (8) hide show

data/CHANGELOG.md +3 -0
data/Gemfile +1 -1
data/README.md +28 -2
data/common-crawl-index.gemspec +1 -1
data/lib/common-crawl-index.rb +5 -4
data/lib/common-crawl-index/version.rb +1 -1
data/spec/basic_spec.rb +7 -1
metadata +7 -6

data/CHANGELOG.md ADDED

@@ -0,0 +1,3 @@
+## v0.0.1.alpha
+* initial release

data/Gemfile CHANGED

@@ -4,4 +4,4 @@ source 'https://rubygems.org'
 gemspec
 gem 'aws-sdk'
-#gem 'open3'
+gem 'addressable'

data/README.md CHANGED

@@ -1,6 +1,6 @@
 # CommonCrawlIndex
-TODO: Write a gem description
+Use this gem to access [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with ruby. You can get more information on format and original python implementation at https://github.com/trivio/common_crawl_index
 ## Installation
@@ -18,7 +18,33 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+When using with Rails in `config/initializers/common_crawl_index.rb`
+```ruby
+CommonCrawlIndex::Client.config({
+  :access_key_id =>  "amazon aws access_key",
+  :secret_access_key => "amazon aws secret_key",
+  :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
+})
+```
+And to find URLs matching certain prefix use following syntax
+```ruby
+client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
+# or
+client = CommonCrawlIndex::Client.new() # already configured
+url = "http://www.amazon.com/"
+client.find_by_prefix(url) do |url_data|
+  # get all URLs starting with http://www.amazon.com/
+end
+```
+See `spec/basic_spec.rb` for more examples on usage.
 ## Contributing

data/common-crawl-index.gemspec CHANGED

@@ -10,7 +10,7 @@ Gem::Specification.new do |gem|
   gem.email         = ["amitamb@gmail.com"]
   gem.description   = %q{Access coomon crawl URL index}
   gem.summary       = %q{Access coomon crawl URL index}
-  gem.homepage      = ""
+  gem.homepage      = "https://github.com/VerticalSet/common-crawl-index"
   gem.add_development_dependency "rspec"

data/lib/common-crawl-index.rb CHANGED

@@ -1,7 +1,7 @@
-require "common-crawl-index/version"
+require 'common-crawl-index/version'
 require 'aws-sdk'
 require 'open3'
+require 'addressable/uri'
 module CommonCrawlIndex
   class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
     def self.normalize_url(url, append_scheme = true)
       url_to_find = url
-      norm_url_to_find = URI(url_to_find)
+      norm_url_to_find = Addressable::URI.parse(url_to_find)
       norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
       norm_url = norm_url_to_find.to_s
       norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
         scheme = normalized_url[colon_index+1..-1] if colon_index
       end
       url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
-      uri = URI(url_with_scheme)
+      uri = Addressable::URI.parse(url_with_scheme)
       uri.host = uri.host.split(".").reverse.join(".")
       uri.to_s
     end
@@ -119,6 +119,7 @@ module CommonCrawlIndex
         end
         cur_loc = nil_loc + 32 + 1
       end
+      true
     end
     def read(target_range)

data/lib/common-crawl-index/version.rb CHANGED

@@ -1,3 +1,3 @@
 module CommonCrawlIndex
-  VERSION = "0.0.1.alpha"
+  VERSION = "0.0.1"
 end

data/spec/basic_spec.rb CHANGED

@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
   it "should find by prefix" do
     client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
-    total_urls_to_test = 100
+    total_urls_to_test = 2500
     url = "http://www.amazon.com/"
     normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
   it "should normalize the urls correctly" do
     normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
     normalized_url.should == "com.google.www/test/path:http"
+    normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
+    normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
   end
   it "should normalize the urls correctly without scheme" do
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
   it "should denormalize the urls correctly" do
     url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
     url.should == "http://www.google.com/test/path"
+    url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
+    url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
   end
   it "should denormalize the urls correctly without scheme" do

metadata CHANGED

@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: common-crawl-index
 version: !ruby/object:Gem::Version
-  version: 0.0.1.alpha
-  prerelease: 6
+  version: 0.0.1
+  prerelease:
 platform: ruby
 authors:
 - Amit Ambardekar
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-23 00:00:00.000000000 Z
+date: 2013-01-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -36,6 +36,7 @@ extra_rdoc_files: []
 files:
 - .gitignore
 - .rspec
+- CHANGELOG.md
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -45,7 +46,7 @@ files:
 - lib/common-crawl-index/version.rb
 - spec/basic_spec.rb
 - spec/spec_helper.rb
-homepage: ''
+homepage: https://github.com/VerticalSet/common-crawl-index
 licenses: []
 post_install_message:
 rdoc_options: []
@@ -60,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '>'
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24