common-crawl-index 0.0.2.alpha → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,7 @@
1
+ ## v0.0.2
2
+
3
+ * Added runtime dependency on the aws-sdk and addressable gems instead of placing them in the Gemfile
4
+
1
5
  ## v0.0.1.alpha
2
6
 
3
7
  * initial release
data/Gemfile CHANGED
@@ -1,7 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in cc-url-index.gemspec
4
- gemspec
5
-
6
- gem 'aws-sdk'
7
- #gem 'open3'
4
+ gemspec
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # CommonCrawlIndex
2
2
 
3
- TODO: Write a gem description
3
+ Use this gem to access the [Common Crawl URL Index](http://commoncrawl.org/common-crawl-url-index/) with Ruby. You can find more information on the index format and the original Python implementation at https://github.com/trivio/common_crawl_index.
4
4
 
5
5
  ## Installation
6
6
 
@@ -18,7 +18,33 @@ Or install it yourself as:
18
18
 
19
19
  ## Usage
20
20
 
21
- TODO: Write usage instructions here
21
+ When using this gem with Rails, configure the client in `config/initializers/common_crawl_index.rb`:
22
+
23
+ ```ruby
24
+ CommonCrawlIndex::Client.config({
25
+ :access_key_id => "amazon aws access_key",
26
+ :secret_access_key => "amazon aws secret_key",
27
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
28
+ })
29
+ ```
30
+
31
+ To find URLs matching a certain prefix, use the following syntax:
32
+
33
+ ```ruby
34
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
35
+
36
+ # or
37
+
38
+ client = CommonCrawlIndex::Client.new() # already configured
39
+
40
+ url = "http://www.amazon.com/"
41
+
42
+ client.find_by_prefix(url) do |url_data|
43
+ # get all URLs starting with http://www.amazon.com/
44
+ end
45
+ ```
46
+
47
+ See `spec/basic_spec.rb` for more usage examples.
22
48
 
23
49
  ## Contributing
24
50
 
@@ -18,4 +18,7 @@ Gem::Specification.new do |gem|
18
18
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
19
19
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
20
20
  gem.require_paths = ["lib"]
21
+
22
+ gem.add_runtime_dependency('aws-sdk')
23
+ gem.add_runtime_dependency('addressable')
21
24
  end
@@ -1,7 +1,7 @@
1
- require "common-crawl-index/version"
1
+ require 'common-crawl-index/version'
2
2
  require 'aws-sdk'
3
3
  require 'open3'
4
-
4
+ require 'addressable/uri'
5
5
 
6
6
  module CommonCrawlIndex
7
7
  class Client
@@ -42,7 +42,7 @@ module CommonCrawlIndex
42
42
 
43
43
  def self.normalize_url(url, append_scheme = true)
44
44
  url_to_find = url
45
- norm_url_to_find = URI(url_to_find)
45
+ norm_url_to_find = Addressable::URI.parse(url_to_find)
46
46
  norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
47
  norm_url = norm_url_to_find.to_s
48
48
  norm_url = norm_url[norm_url.index("\/\/")+2..-1]
@@ -58,7 +58,7 @@ module CommonCrawlIndex
58
58
  scheme = normalized_url[colon_index+1..-1] if colon_index
59
59
  end
60
60
  url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
- uri = URI(url_with_scheme)
61
+ uri = Addressable::URI.parse(url_with_scheme)
62
62
  uri.host = uri.host.split(".").reverse.join(".")
63
63
  uri.to_s
64
64
  end
@@ -119,6 +119,7 @@ module CommonCrawlIndex
119
119
  end
120
120
  cur_loc = nil_loc + 32 + 1
121
121
  end
122
+ true
122
123
  end
123
124
 
124
125
  def read(target_range)
@@ -1,3 +1,3 @@
1
1
  module CommonCrawlIndex
2
- VERSION = "0.0.2.alpha"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -26,7 +26,7 @@ describe CommonCrawlIndex do
26
26
  it "should find by prefix" do
27
27
  client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
28
 
29
- total_urls_to_test = 100
29
+ total_urls_to_test = 2500
30
30
 
31
31
  url = "http://www.amazon.com/"
32
32
  normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
@@ -52,6 +52,9 @@ describe CommonCrawlIndex do
52
52
  it "should normalize the urls correctly" do
53
53
  normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
54
  normalized_url.should == "com.google.www/test/path:http"
55
+
56
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8")
57
+ normalized_url.should == "com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http"
55
58
  end
56
59
 
57
60
  it "should normalize the urls correctly without scheme" do
@@ -62,6 +65,9 @@ describe CommonCrawlIndex do
62
65
  it "should denormalize the urls correctly" do
63
66
  url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
64
67
  url.should == "http://www.google.com/test/path"
68
+
69
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8:http")
70
+ url.should == "http://www.google.com/cse?cx=009462381166450434430:-woy8fnynf8&ie=UTF-8&q=physician+Cardiology+Diagnostics+md+\"Greater+Atlanta+Area\"+-recruiter&sa=Search&siteurl=www.google.com/cse/home%3Fcx%3D009462381166450434430%253A-woy8fnynf8"
65
71
  end
66
72
 
67
73
  it "should denormalize the urls correctly without scheme" do
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: common-crawl-index
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2.alpha
5
- prerelease: 6
4
+ version: 0.0.2
5
+ prerelease:
6
6
  platform: ruby
7
7
  authors:
8
8
  - Amit Ambardekar
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-23 00:00:00.000000000 Z
12
+ date: 2013-02-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -27,6 +27,38 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: aws-sdk
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: addressable
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
30
62
  description: Access Common Crawl URL index
31
63
  email:
32
64
  - amitamb@gmail.com
@@ -61,9 +93,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
61
93
  required_rubygems_version: !ruby/object:Gem::Requirement
62
94
  none: false
63
95
  requirements:
64
- - - ! '>'
96
+ - - ! '>='
65
97
  - !ruby/object:Gem::Version
66
- version: 1.3.1
98
+ version: '0'
67
99
  requirements: []
68
100
  rubyforge_project:
69
101
  rubygems_version: 1.8.24