common-crawl-index 0.0.1 → 0.0.2.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -4,4 +4,4 @@ source 'https://rubygems.org'
4
4
  gemspec
5
5
 
6
6
  gem 'aws-sdk'
7
- gem 'addressable'
7
+ #gem 'open3'
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CommonCrawlIndex
2
2
 
3
- Use this gem to access [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with ruby. You can get more information on format and original python implementation at https://github.com/trivio/common_crawl_index
3
+ TODO: Write a gem description
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,33 +18,7 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- When using with Rails in `config/initializers/common_crawl_index.rb`
22
-
23
- ```ruby
24
- CommonCrawlIndex::Client.config({
25
- :access_key_id => "amazon aws access_key",
26
- :secret_access_key => "amazon aws secret_key",
27
- :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
28
- })
29
- ```
30
-
31
- And to find URLs matching certain prefix use following syntax
32
-
33
- ```ruby
34
- client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
35
-
36
- # or
37
-
38
- client = CommonCrawlIndex::Client.new() # already configured
39
-
40
- url = "http://www.amazon.com/"
41
-
42
- client.find_by_prefix(url) do |url_data|
43
- # get all URLs starting with http://www.amazon.com/
44
- end
45
- ```
46
-
47
- See `spec/basic_spec.rb` for more examples on usage.
21
+ TODO: Write usage instructions here
48
22
 
49
23
  ## Contributing
50
24
 
@@ -1,7 +1,7 @@
1
- require 'common-crawl-index/version'
1
+ require "common-crawl-index/version"
2
2
  require 'aws-sdk'
3
3
  require 'open3'
4
- require 'addressable/uri'
4
+
5
5
 
6
6
  module CommonCrawlIndex
7
7
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
42
42
 
43
43
  def self.normalize_url(url, append_scheme = true)
44
44
  url_to_find = url
45
- norm_url_to_find = Addressable::URI.parse(url_to_find)
45
+ norm_url_to_find = URI(url_to_find)
46
46
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
47
  norm_url = norm_url_to_find.to_s
48
48
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
58
58
  scheme = normalized_url[colon_index+1..-1] if colon_index
59
59
  end
60
60
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
- uri = Addressable::URI.parse(url_with_scheme)
61
+ uri = URI(url_with_scheme)
62
62
  uri.host = uri.host.split(".").reverse.join(".")
63
63
  uri.to_s
64
64
  end
@@ -119,7 +119,6 @@ module CommonCrawlIndex
119
119
  end
120
120
  cur_loc = nil_loc + 32 + 1
121
121
  end
122
- true
123
122
  end
124
123
 
125
124
  def read(target_range)
@@ -1,3 +1,3 @@
1
1
  module CommonCrawlIndex
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2.alpha"
3
3
  end
data/spec/basic_spec.rb CHANGED
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
26
26
  it "should find by prefix" do
27
27
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
28
 
29
- total_urls_to_test = 2500
29
+ total_urls_to_test = 100
30
30
 
31
31
  url = "http://www.amazon.com/"
32
32
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,9 +52,6 @@ describe CommonCrawlIndex do
52
52
  it "should normalize the urls correctly" do
53
53
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
54
  normalized_url.should == "com.google.www/test/path:http"
55
-
56
- normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
57
- normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
58
55
  end
59
56
 
60
57
  it "should normalize the urls correctly without scheme" do
@@ -65,9 +62,6 @@ describe CommonCrawlIndex do
65
62
  it "should denormalize the urls correctly" do
66
63
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
67
64
  url.should == "http://www.google.com/test/path"
68
-
69
- url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
70
- url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
71
65
  end
72
66
 
73
67
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: common-crawl-index
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
5
- prerelease:
4
+ version: 0.0.2.alpha
5
+ prerelease: 6
6
6
  platform: ruby
7
7
  authors:
8
8
  - Amit Ambardekar
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-24 00:00:00.000000000 Z
12
+ date: 2013-01-23 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -61,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
61
  required_rubygems_version: !ruby/object:Gem::Requirement
62
62
  none: false
63
63
  requirements:
64
- - - ! '>='
64
+ - - ! '>'
65
65
  - !ruby/object:Gem::Version
66
- version: '0'
66
+ version: 1.3.1
67
67
  requirements: []
68
68
  rubyforge_project:
69
69
  rubygems_version: 1.8.24