common-crawl-index 0.0.2.alpha → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,7 @@
1
+ ## v0.0.2
2
+
3
+ * Added runtime dependency for aws-sdk and addressable gems instead of placing them in Gemfile
4
+
1
5
  ## v0.0.1.alpha
2
6
 
3
7
  * initial release
data/Gemfile CHANGED
@@ -1,7 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in cc-url-index.gemspec
4
- gemspec
5
-
6
- gem 'aws-sdk'
7
- #gem 'open3'
4
+ gemspec
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CommonCrawlIndex
2
2
 
3
- TODO: Write a gem description
3
+ Use this gem to access the [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with Ruby. You can find more information on the index format and the original Python implementation at https://github.com/trivio/common_crawl_index
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,33 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ When using with Rails, add the following to `config/initializers/common_crawl_index.rb`:
22
+
23
+ ```ruby
24
+ CommonCrawlIndex::Client.config({
25
+ :access_key_id => "amazon aws access_key",
26
+ :secret_access_key => "amazon aws secret_key",
27
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
28
+ })
29
+ ```
30
+
31
+ To find URLs matching a certain prefix, use the following syntax:
32
+
33
+ ```ruby
34
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
35
+
36
+ # or
37
+
38
+ client = CommonCrawlIndex::Client.new() # already configured
39
+
40
+ url = "http://www.amazon.com/"
41
+
42
+ client.find_by_prefix(url) do |url_data|
43
+ # get all URLs starting with http://www.amazon.com/
44
+ end
45
+ ```
46
+
47
+ See `spec/basic_spec.rb` for more usage examples.
22
48
 
23
49
  ## Contributing
24
50
 
@@ -18,4 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
19
19
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
20
20
  gem.require_paths = ["lib"]
21
+
22
+ gem.add_runtime_dependency('aws-sdk')
23
+ gem.add_runtime_dependency('addressable')
21
24
  end
@@ -1,7 +1,7 @@
1
- require "common-crawl-index/version"
1
+ require 'common-crawl-index/version'
2
2
  require 'aws-sdk'
3
3
  require 'open3'
4
-
4
+ require 'addressable/uri'
5
5
 
6
6
  module CommonCrawlIndex
7
7
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
42
42
 
43
43
  def self.normalize_url(url, append_scheme = true)
44
44
  url_to_find = url
45
- norm_url_to_find = URI(url_to_find)
45
+ norm_url_to_find = Addressable::URI.parse(url_to_find)
46
46
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
47
  norm_url = norm_url_to_find.to_s
48
48
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
58
58
  scheme = normalized_url[colon_index+1..-1] if colon_index
59
59
  end
60
60
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
- uri = URI(url_with_scheme)
61
+ uri = Addressable::URI.parse(url_with_scheme)
62
62
  uri.host = uri.host.split(".").reverse.join(".")
63
63
  uri.to_s
64
64
  end
@@ -119,6 +119,7 @@ module CommonCrawlIndex
119
119
  end
120
120
  cur_loc = nil_loc + 32 + 1
121
121
  end
122
+ true
122
123
  end
123
124
 
124
125
  def read(target_range)
@@ -1,3 +1,3 @@
1
1
  module CommonCrawlIndex
2
- VERSION = "0.0.2.alpha"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
26
26
  it "should find by prefix" do
27
27
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
28
 
29
- total_urls_to_test = 100
29
+ total_urls_to_test = 2500
30
30
 
31
31
  url = "http://www.amazon.com/"
32
32
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
52
52
  it "should normalize the urls correctly" do
53
53
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
54
  normalized_url.should == "com.google.www/test/path:http"
55
+
56
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
57
+ normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
55
58
  end
56
59
 
57
60
  it "should normalize the urls correctly without scheme" do
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
62
65
  it "should denormalize the urls correctly" do
63
66
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
64
67
  url.should == "http://www.google.com/test/path"
68
+
69
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
70
+ url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
65
71
  end
66
72
 
67
73
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: common-crawl-index
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2.alpha
5
- prerelease: 6
4
+ version: 0.0.2
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Amit Ambardekar
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -27,6 +27,38 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: aws-sdk
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: addressable
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
30
62
  description: Access Common Crawl URL index
31
63
  email:
32
64
  - amitamb@gmail.com
@@ -61,9 +93,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
93
  required_rubygems_version: !ruby/object:Gem::Requirement
62
94
  none: false
63
95
  requirements:
64
- - - ! '>'
96
+ - - ! '>='
65
97
  - !ruby/object:Gem::Version
66
- version: 1.3.1
98
+ version: '0'
67
99
  requirements: []
68
100
  rubyforge_project:
69
101
  rubygems_version: 1.8.24