RubyGems - common-crawl-index - Versions diffs - 0.0.1 → 0.0.2.alpha - Mend

common-crawl-index 0.0.1 → 0.0.2.alpha

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (6)

data/Gemfile +1 -1
data/README.md +2 -28
data/lib/common-crawl-index.rb +4 -5
data/lib/common-crawl-index/version.rb +1 -1
data/spec/basic_spec.rb +1 -7
metadata +5 -5

data/Gemfile CHANGED Viewed

@@ -4,4 +4,4 @@ source 'https://rubygems.org'
 gemspec
 gem 'aws-sdk'
-gem 'addressable'
+#gem 'open3'

data/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # CommonCrawlIndex
-Use this gem to access [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with ruby. You can get more information on format and original python implementation at https://github.com/trivio/common_crawl_index
+TODO: Write a gem description
 ## Installation
@@ -18,33 +18,7 @@ Or install it yourself as:
 ## Usage
-When using with Rails in `config/initializers/common_crawl_index.rb`
-```ruby
-CommonCrawlIndex::Client.config({
-  :access_key_id =>  "amazon aws access_key",
-  :secret_access_key => "amazon aws secret_key",
-  :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
-})
-```
-And to find URLs matching certain prefix use following syntax
-```ruby
-client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
-# or
-client = CommonCrawlIndex::Client.new() # already configured
-url = "http://www.amazon.com/"
-client.find_by_prefix(url) do |url_data|
-  # get all URLs starting with http://www.amazon.com/
-end
-```
-See `spec/basic_spec.rb` for more examples on usage.
+TODO: Write usage instructions here
 ## Contributing

data/lib/common-crawl-index.rb CHANGED Viewed

@@ -1,7 +1,7 @@
-require 'common-crawl-index/version'
+require "common-crawl-index/version"
 require 'aws-sdk'
 require 'open3'
-require 'addressable/uri'
 module CommonCrawlIndex
   class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
     def self.normalize_url(url, append_scheme = true)
       url_to_find = url
-      norm_url_to_find = Addressable::URI.parse(url_to_find)
+      norm_url_to_find = URI(url_to_find)
       norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
       norm_url = norm_url_to_find.to_s
       norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
         scheme = normalized_url[colon_index+1..-1] if colon_index
       end
       url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
-      uri = Addressable::URI.parse(url_with_scheme)
+      uri = URI(url_with_scheme)
       uri.host = uri.host.split(".").reverse.join(".")
       uri.to_s
     end
@@ -119,7 +119,6 @@ module CommonCrawlIndex
         end
         cur_loc = nil_loc + 32 + 1
       end
-      true
     end
     def read(target_range)

data/lib/common-crawl-index/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CommonCrawlIndex
-  VERSION = "0.0.1"
+  VERSION = "0.0.2.alpha"
 end

data/spec/basic_spec.rb CHANGED Viewed

@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
   it "should find by prefix" do
     client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
-    total_urls_to_test = 2500
+    total_urls_to_test = 100
     url = "http://www.amazon.com/"
     normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,9 +52,6 @@ describe CommonCrawlIndex do
   it "should normalize the urls correctly" do
     normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
     normalized_url.should == "com.google.www/test/path:http"
-    normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
-    normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
   end
   it "should normalize the urls correctly without scheme" do
@@ -65,9 +62,6 @@ describe CommonCrawlIndex do
   it "should denormalize the urls correctly" do
     url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
     url.should == "http://www.google.com/test/path"
-    url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
-    url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
   end
   it "should denormalize the urls correctly without scheme" do

metadata CHANGED Viewed

@@ -1,15 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: common-crawl-index
 version: !ruby/object:Gem::Version
-  version: 0.0.1
-  prerelease:
+  version: 0.0.2.alpha
+  prerelease: 6
 platform: ruby
 authors:
 - Amit Ambardekar
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-01-24 00:00:00.000000000 Z
+date: 2013-01-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -61,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - ! '>='
+  - - ! '>'
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 1.8.24