common-crawl-index 0.0.1.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in common-crawl-index.gemspec
4
+ gemspec
5
+
6
+ gem 'aws-sdk'
7
+ #gem 'open3'
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Amit Ambardekar
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # CommonCrawlIndex
2
+
3
+ TODO: Write a gem description
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'common-crawl-index'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install common-crawl-index
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create a new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'common-crawl-index/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "common-crawl-index"
8
+ gem.version = CommonCrawlIndex::VERSION
9
+ gem.authors = ["Amit Ambardekar"]
10
+ gem.email = ["amitamb@gmail.com"]
11
+ gem.description = %q{Access coomon crawl URL index}
12
+ gem.summary = %q{Access coomon crawl URL index}
13
+ gem.homepage = ""
14
+
15
+ gem.add_development_dependency "rspec"
16
+
17
+ gem.files = `git ls-files`.split($/)
18
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
19
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
20
+ gem.require_paths = ["lib"]
21
+ end
@@ -0,0 +1,171 @@
1
+ require "common-crawl-index/version"
2
+ require 'aws-sdk'
3
+ require 'open3'
4
+
5
+
6
+ module CommonCrawlIndex
7
+ class Client
8
+ @@settings = {
9
+ :access_key_id => nil,
10
+ :secret_access_key => nil,
11
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792"
12
+ }
13
+
14
+ def self.config(settings = {})
15
+ @@settings = @@settings.merge(settings)
16
+ end
17
+
18
+ HEADER_OFFSET = 8
19
+
20
+ def initialize(access_key_id=nil, secret_access_key=nil, cc_index_path = nil)
21
+ @s3=AWS::S3.new(
22
+ :access_key_id => access_key_id || @@settings[:access_key_id],
23
+ :secret_access_key => secret_access_key || @@settings[:secret_access_key]
24
+ )
25
+
26
+ @cc_index_path = cc_index_path || @@settings[:cc_index_path]
27
+
28
+ proto,unused,@bucket_name,*rest=@cc_index_path.chomp.split File::SEPARATOR
29
+ raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}" unless proto=~/^s3/
30
+ @object_name=File.join rest
31
+
32
+ @block_size, @index_block_count = read( (0..7) ).unpack("LL")
33
+ end
34
+
35
+ def find_by_prefix(url, exact_match = false, &proc_block)
36
+ next_block = 0
37
+ while next_block < @index_block_count
38
+ next_block = get_next_block_id(url, next_block)
39
+ end
40
+ get_matching_urls_from_data_blocks(next_block, url, exact_match, &proc_block)
41
+ end
42
+
43
+ def self.normalize_url(url, append_scheme = true)
44
+ url_to_find = url
45
+ norm_url_to_find = URI(url_to_find)
46
+ norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
47
+ norm_url = norm_url_to_find.to_s
48
+ norm_url = norm_url[norm_url.index("\/\/")+2..-1]
49
+ norm_url += ":" + norm_url_to_find.scheme if append_scheme
50
+ norm_url
51
+ end
52
+
53
+ def self.denormalize_url(normalized_url, has_scheme = true)
54
+ scheme = "http"
55
+ colon_index = 0
56
+ if has_scheme
57
+ colon_index = normalized_url.rindex(":")
58
+ scheme = normalized_url[colon_index+1..-1] if colon_index
59
+ end
60
+ url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
61
+ uri = URI(url_with_scheme)
62
+ uri.host = uri.host.split(".").reverse.join(".")
63
+ uri.to_s
64
+ end
65
+
66
+ private
67
+
68
+ def s3_object
69
+ @s3_object ||= @s3.buckets[@bucket_name].objects[@object_name]
70
+ end
71
+
72
+ def get_matching_urls_from_data_blocks(start_block, url_to_find, exact_match, &proc_block)
73
+
74
+ norm_url = Client.normalize_url(url_to_find, false)
75
+ norm_url_length = norm_url.length
76
+
77
+ first_match_found = false
78
+
79
+ cur_block_index = start_block
80
+ cur_block = read_block(start_block)
81
+ cur_loc = 0
82
+
83
+ end_found = false
84
+ while(!end_found)
85
+ if cur_block[cur_loc..cur_loc] == "\x00" || cur_loc >= @block_size
86
+ # to next block
87
+ if !first_match_found || exact_match # don't search next block for exact match
88
+ return false
89
+ end
90
+ cur_block_index += 1
91
+ cur_block = read_block(cur_block_index)
92
+ cur_loc = 0
93
+ end
94
+ nil_loc = cur_block.index("\x00", cur_loc)
95
+ url = cur_block[cur_loc..nil_loc-1]
96
+ if url[0..norm_url_length-1] == norm_url
97
+ url_data = {}
98
+ url_data[:normalized_url] = url
99
+ url_data[:url] = Client.denormalize_url(url)
100
+ a,b,c,d,e = cur_block[nil_loc+1..nil_loc+32].unpack("QQLQL")
101
+ url_data[:arcSourceSegmentId] = a
102
+ url_data[:arcFileDate] = b
103
+ url_data[:arcFilePartition] = c
104
+ url_data[:arcFileOffset] = d
105
+ url_data[:compressedSize] = e
106
+ if exact_match
107
+ if url == Client.normalize_url(url_to_find, true)
108
+ proc_block.call(url_data)
109
+ break
110
+ end
111
+ else
112
+ first_match_found = true
113
+ break if proc_block.call(url_data) == false
114
+ end
115
+ else
116
+ if first_match_found
117
+ break
118
+ end
119
+ end
120
+ cur_loc = nil_loc + 32 + 1
121
+ end
122
+ end
123
+
124
+ def read(target_range)
125
+ s3_object.read( :range => target_range )
126
+ end
127
+
128
+ def read_block(block_id)
129
+ #puts "Reading block No: #{block_id}"
130
+ start = HEADER_OFFSET + @block_size * block_id
131
+ target_range = (start..start+@block_size-1)
132
+ cur_block = read(target_range)
133
+ end
134
+
135
+ # search within
136
+ def get_next_block_id(url_to_find, block_id)
137
+ norm_url = Client.normalize_url(url_to_find, false)
138
+ cur_block = read_block(block_id)
139
+
140
+ not_found = true
141
+ cur_loc = 4
142
+ last_block_num = nil
143
+
144
+ counter = 0
145
+
146
+ while not_found
147
+
148
+ counter += 1
149
+
150
+ # read from cur_loc
151
+ next_nil_loc = cur_block.index("\x00", cur_loc)
152
+
153
+ break if next_nil_loc == cur_loc + 1
154
+
155
+ cur_prefix = cur_block[cur_loc..next_nil_loc-1]
156
+ cur_block_num = cur_block[next_nil_loc+1..next_nil_loc+1+4].unpack("L")[0]
157
+
158
+ if cur_prefix >= norm_url
159
+ next_block = last_block_num || cur_block_num
160
+ return next_block
161
+ end
162
+
163
+ break if cur_loc >= @block_size
164
+
165
+ last_block_num = cur_block_num
166
+ cur_loc = next_nil_loc + 1 + 4
167
+ end
168
+
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,3 @@
1
+ module CommonCrawlIndex
2
+ VERSION = "0.0.1.alpha"
3
+ end
@@ -0,0 +1,71 @@
1
+ require 'spec_helper'
2
+ describe CommonCrawlIndex do
3
+
4
+ AMAZON_ACCESS_KEY_ID = ENV['AMAZON_ACCESS_KEY_ID']
5
+ AMAZON_SECRET_ACCESS_KEY = ENV['AMAZON_SECRET_ACCESS_KEY']
6
+
7
+ it "should config successfully" do
8
+ settings = {
9
+ :access_key_id => "access_key",
10
+ :secret_access_key => "secret_key",
11
+ :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
12
+ }
13
+
14
+ CommonCrawlIndex::Client.config(settings)
15
+
16
+ final_settings = CommonCrawlIndex::Client.class_variable_get(:@@settings)
17
+
18
+ final_settings.should == settings
19
+ end
20
+
21
+ it "should initialize client" do
22
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
23
+ client.should_not == nil
24
+ end
25
+
26
+ it "should find by prefix" do
27
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
28
+
29
+ total_urls_to_test = 100
30
+
31
+ url = "http://www.amazon.com/"
32
+ normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
33
+ normalized_url_length = normalized_url.length
34
+
35
+ client.find_by_prefix(url) do |url_data|
36
+ total_urls_to_test -= 1
37
+ prefix = url_data[:normalized_url][0..normalized_url_length-1]
38
+ normalized_url.should eql prefix
39
+ false if total_urls_to_test == 0
40
+ end
41
+ end
42
+
43
+ it "should match an exact url" do
44
+ client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
45
+
46
+ client.find_by_prefix("http://www.google.com/", true) do |url_data|
47
+ expected_url_data = {:normalized_url=>"com.google.www/:http", :url=>"http://www.google.com/", :arcSourceSegmentId=>1346823846039, :arcFileDate=>1346870285062, :arcFilePartition=>14, :arcFileOffset=>38347629, :compressedSize=>6198}
48
+ url_data.should eql expected_url_data
49
+ end
50
+ end
51
+
52
+ it "should normalize the urls correctly" do
53
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
54
+ normalized_url.should == "com.google.www/test/path:http"
55
+ end
56
+
57
+ it "should normalize the urls correctly without scheme" do
58
+ normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path", false)
59
+ normalized_url.should == "com.google.www/test/path"
60
+ end
61
+
62
+ it "should denormalize the urls correctly" do
63
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
64
+ url.should == "http://www.google.com/test/path"
65
+ end
66
+
67
+ it "should denormalize the urls correctly without scheme" do
68
+ url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path", false)
69
+ url.should == "http://www.google.com/test/path"
70
+ end
71
+ end
@@ -0,0 +1,22 @@
1
+ require 'rubygems'
2
+ require 'bundler/setup'
3
+
4
+ require 'common-crawl-index' # and any other gems you need
5
+
6
+ # This file was generated by the `rspec --init` command. Conventionally, all
7
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
8
+ # Require this file using `require "spec_helper"` to ensure that it is only
9
+ # loaded once.
10
+ #
11
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
12
+ RSpec.configure do |config|
13
+ config.treat_symbols_as_metadata_keys_with_true_values = true
14
+ config.run_all_when_everything_filtered = true
15
+ config.filter_run :focus
16
+
17
+ # Run specs in random order to surface order dependencies. If you find an
18
+ # order dependency and want to debug it, you can fix the order by providing
19
+ # the seed, which is printed after each run.
20
+ # --seed 1234
21
+ config.order = 'random'
22
+ end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: common-crawl-index
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1.alpha
5
+ prerelease: 6
6
+ platform: ruby
7
+ authors:
8
+ - Amit Ambardekar
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: Access coomon crawl URL index
31
+ email:
32
+ - amitamb@gmail.com
33
+ executables: []
34
+ extensions: []
35
+ extra_rdoc_files: []
36
+ files:
37
+ - .gitignore
38
+ - .rspec
39
+ - Gemfile
40
+ - LICENSE.txt
41
+ - README.md
42
+ - Rakefile
43
+ - common-crawl-index.gemspec
44
+ - lib/common-crawl-index.rb
45
+ - lib/common-crawl-index/version.rb
46
+ - spec/basic_spec.rb
47
+ - spec/spec_helper.rb
48
+ homepage: ''
49
+ licenses: []
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ required_rubygems_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>'
64
+ - !ruby/object:Gem::Version
65
+ version: 1.3.1
66
+ requirements: []
67
+ rubyforge_project:
68
+ rubygems_version: 1.8.24
69
+ signing_key:
70
+ specification_version: 3
71
+ summary: Access coomon crawl URL index
72
+ test_files:
73
+ - spec/basic_spec.rb
74
+ - spec/spec_helper.rb