common-crawl-index 0.0.1 → 0.0.2.alpha

data/Gemfile CHANGED
@@ -4,4 +4,4 @@ source 'https://rubygems.org'
  gemspec
 
  gem 'aws-sdk'
- gem 'addressable'
+ #gem 'open3'
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # CommonCrawlIndex
 
- Use this gem to access [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with ruby. You can get more information on format and original python implementation at https://github.com/trivio/common_crawl_index
+ TODO: Write a gem description
 
  ## Installation
 
@@ -18,33 +18,7 @@ Or install it yourself as:
 
  ## Usage
 
- When using with Rails in `config/initializers/common_crawl_index.rb`
-
- ```ruby
- CommonCrawlIndex::Client.config({
-   :access_key_id => "amazon aws access_key",
-   :secret_access_key => "amazon aws secret_key",
-   :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
- })
- ```
-
- And to find URLs matching certain prefix use following syntax
-
- ```ruby
- client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
-
- # or
-
- client = CommonCrawlIndex::Client.new() # already configured
-
- url = "http://www.amazon.com/"
-
- client.find_by_prefix(url) do |url_data|
-   # get all URLs starting with http://www.amazon.com/
- end
- ```
-
- See `spec/basic_spec.rb` for more examples on usage.
+ TODO: Write usage instructions here
 
  ## Contributing
 
data/lib/common-crawl-index.rb CHANGED
@@ -1,7 +1,7 @@
- require 'common-crawl-index/version'
+ require "common-crawl-index/version"
  require 'aws-sdk'
  require 'open3'
- require 'addressable/uri'
+
 
  module CommonCrawlIndex
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
 
  def self.normalize_url(url, append_scheme = true)
  url_to_find = url
- norm_url_to_find = Addressable::URI.parse(url_to_find)
+ norm_url_to_find = URI(url_to_find)
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
  norm_url = norm_url_to_find.to_s
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
  scheme = normalized_url[colon_index+1..-1] if colon_index
  end
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
- uri = Addressable::URI.parse(url_with_scheme)
+ uri = URI(url_with_scheme)
  uri.host = uri.host.split(".").reverse.join(".")
  uri.to_s
  end
@@ -119,7 +119,6 @@ module CommonCrawlIndex
  end
  cur_loc = nil_loc + 32 + 1
  end
- true
  end
 
  def read(target_range)
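This release drops the `addressable` dependency in favor of Ruby's built-in `URI`. The index stores each URL with its host components reversed and the scheme appended (the spec below expects `"com.google.www/test/path:http"`), so a prefix scan over the sorted index naturally groups URLs by domain. A rough standalone sketch of that normalization using stdlib `URI` (not the gem's exact code, and simplified to HTTP URLs with a path):

```ruby
require 'uri'

# Standalone sketch of the reversed-host form used by the URL index.
# Note: stdlib URI is stricter than Addressable, so URLs containing
# unescaped characters (like the quoted query string in the spec
# examples removed below) raise URI::InvalidURIError instead of parsing.
def normalize_url(url)
  uri = URI(url)
  reversed_host = uri.host.split(".").reverse.join(".")
  "#{reversed_host}#{uri.request_uri}:#{uri.scheme}"
end

puts normalize_url("http://www.google.com/test/path")
# => com.google.www/test/path:http
```

That stricter parsing is likely why the long `google.com/cse?...` examples were dropped from the specs in this version.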
data/lib/common-crawl-index/version.rb CHANGED
@@ -1,3 +1,3 @@
  module CommonCrawlIndex
- VERSION = "0.0.1"
+ VERSION = "0.0.2.alpha"
  end
data/spec/basic_spec.rb CHANGED
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
  it "should find by prefix" do
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
 
- total_urls_to_test = 2500
+ total_urls_to_test = 100
 
  url = "http://www.amazon.com/"
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,9 +52,6 @@ describe CommonCrawlIndex do
  it "should normalize the urls correctly" do
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
  normalized_url.should == "com.google.www/test/path:http"
-
- normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
- normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
  end
 
  it "should normalize the urls correctly without scheme" do
@@ -65,9 +62,6 @@ describe CommonCrawlIndex do
  it "should denormalize the urls correctly" do
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
  url.should == "http://www.google.com/test/path"
-
- url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
- url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
  end
 
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
  --- !ruby/object:Gem::Specification
  name: common-crawl-index
  version: !ruby/object:Gem::Version
- version: 0.0.1
- prerelease:
+ version: 0.0.2.alpha
+ prerelease: 6
  platform: ruby
  authors:
  - Amit Ambardekar
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-01-24 00:00:00.000000000 Z
+ date: 2013-01-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rspec
@@ -61,9 +61,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
  required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
- - - ! '>='
+ - - ! '>'
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.3.1
  requirements: []
  rubyforge_project:
  rubygems_version: 1.8.24
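The `required_rubygems_version` change from `>= 0` to `> 1.3.1` is the constraint RubyGems applies automatically once a version string is marked prerelease, as `0.0.2.alpha` is here. Since Bundler and `gem install` skip prerelease versions by default, the alpha has to be requested explicitly; a minimal Gemfile sketch (assuming the gem is pulled from rubygems.org):

```ruby
# Gemfile - opting in to the prerelease explicitly; a plain
# `gem 'common-crawl-index'` would still resolve to 0.0.1.
source 'https://rubygems.org'

gem 'common-crawl-index', '0.0.2.alpha'
```

On the command line the equivalent opt-in is `gem install common-crawl-index --pre`.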