common-crawl-index 0.0.1.alpha

data/.gitignore ADDED
@@ -0,0 +1,18 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,7 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in common-crawl-index.gemspec
+ gemspec
+
+ gem 'aws-sdk'
+ #gem 'open3'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 Amit Ambardekar
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # CommonCrawlIndex
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'common-crawl-index'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install common-crawl-index
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
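The README's Usage section is still a TODO. A minimal sketch of how this release's client appears to be used, pieced together from lib/common-crawl-index.rb and spec/basic_spec.rb (the AWS keys and the example URL below are placeholders, not values from the gem):

    require 'common-crawl-index'

    # Placeholder credentials -- substitute real AWS keys.
    CommonCrawlIndex::Client.config(
      :access_key_id     => "YOUR_ACCESS_KEY_ID",
      :secret_access_key => "YOUR_SECRET_ACCESS_KEY"
    )

    client = CommonCrawlIndex::Client.new

    # Yields a hash for every indexed URL that starts with the prefix;
    # returning false from the block stops the scan early.
    client.find_by_prefix("http://www.example.com/") do |url_data|
      puts url_data[:url]
    end

    # Pass true as the second argument to match only the exact URL.
    client.find_by_prefix("http://www.example.com/", true) do |url_data|
      p url_data
    end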
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/common-crawl-index.gemspec ADDED
@@ -0,0 +1,21 @@
+ # -*- encoding: utf-8 -*-
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'common-crawl-index/version'
+
+ Gem::Specification.new do |gem|
+   gem.name          = "common-crawl-index"
+   gem.version       = CommonCrawlIndex::VERSION
+   gem.authors       = ["Amit Ambardekar"]
+   gem.email         = ["amitamb@gmail.com"]
+   gem.description   = %q{Access common crawl URL index}
+   gem.summary       = %q{Access common crawl URL index}
+   gem.homepage      = ""
+
+   gem.add_development_dependency "rspec"
+
+   gem.files         = `git ls-files`.split($/)
+   gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+ end
data/lib/common-crawl-index.rb ADDED
@@ -0,0 +1,171 @@
+ require "common-crawl-index/version"
+ require 'aws-sdk'
+ require 'open3'
+
+
+ module CommonCrawlIndex
+   class Client
+     @@settings = {
+       :access_key_id => nil,
+       :secret_access_key => nil,
+       :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792"
+     }
+
+     def self.config(settings = {})
+       @@settings = @@settings.merge(settings)
+     end
+
+     HEADER_OFFSET = 8
+
+     def initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil)
+       @s3 = AWS::S3.new(
+         :access_key_id => access_key_id || @@settings[:access_key_id],
+         :secret_access_key => secret_access_key || @@settings[:secret_access_key]
+       )
+
+       @cc_index_path = cc_index_path || @@settings[:cc_index_path]
+
+       proto, unused, @bucket_name, *rest = @cc_index_path.chomp.split File::SEPARATOR
+       raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}" unless proto =~ /^s3/
+       @object_name = File.join rest
+
+       @block_size, @index_block_count = read( (0..7) ).unpack("LL")
+     end
+
+     def find_by_prefix(url, exact_match = false, &proc_block)
+       next_block = 0
+       while next_block < @index_block_count
+         next_block = get_next_block_id(url, next_block)
+       end
+       get_matching_urls_from_data_blocks(next_block, url, exact_match, &proc_block)
+     end
+
+     def self.normalize_url(url, append_scheme = true)
+       url_to_find = url
+       norm_url_to_find = URI(url_to_find)
+       norm_url_to_find.host = norm_url_to_find.host.split(".").reverse.join(".")
+       norm_url = norm_url_to_find.to_s
+       norm_url = norm_url[norm_url.index("//")+2..-1]
+       norm_url += ":" + norm_url_to_find.scheme if append_scheme
+       norm_url
+     end
+
+     def self.denormalize_url(normalized_url, has_scheme = true)
+       scheme = "http"
+       colon_index = 0
+       if has_scheme
+         colon_index = normalized_url.rindex(":")
+         scheme = normalized_url[colon_index+1..-1] if colon_index
+       end
+       url_with_scheme = scheme + "://" + normalized_url[0..colon_index-1]
+       uri = URI(url_with_scheme)
+       uri.host = uri.host.split(".").reverse.join(".")
+       uri.to_s
+     end
+
+     private
+
+     def s3_object
+       @s3_object ||= @s3.buckets[@bucket_name].objects[@object_name]
+     end
+
+     def get_matching_urls_from_data_blocks(start_block, url_to_find, exact_match, &proc_block)
+
+       norm_url = Client.normalize_url(url_to_find, false)
+       norm_url_length = norm_url.length
+
+       first_match_found = false
+
+       cur_block_index = start_block
+       cur_block = read_block(start_block)
+       cur_loc = 0
+
+       end_found = false
+       while(!end_found)
+         if cur_block[cur_loc..cur_loc] == "\x00" || cur_loc >= @block_size
+           # to next block
+           if !first_match_found || exact_match # don't search next block for exact match
+             return false
+           end
+           cur_block_index += 1
+           cur_block = read_block(cur_block_index)
+           cur_loc = 0
+         end
+         nil_loc = cur_block.index("\x00", cur_loc)
+         url = cur_block[cur_loc..nil_loc-1]
+         if url[0..norm_url_length-1] == norm_url
+           url_data = {}
+           url_data[:normalized_url] = url
+           url_data[:url] = Client.denormalize_url(url)
+           a, b, c, d, e = cur_block[nil_loc+1..nil_loc+32].unpack("QQLQL")
+           url_data[:arcSourceSegmentId] = a
+           url_data[:arcFileDate] = b
+           url_data[:arcFilePartition] = c
+           url_data[:arcFileOffset] = d
+           url_data[:compressedSize] = e
+           if exact_match
+             if url == Client.normalize_url(url_to_find, true)
+               proc_block.call(url_data)
+               break
+             end
+           else
+             first_match_found = true
+             break if proc_block.call(url_data) == false
+           end
+         else
+           if first_match_found
+             break
+           end
+         end
+         cur_loc = nil_loc + 32 + 1
+       end
+     end
+
+     def read(target_range)
+       s3_object.read( :range => target_range )
+     end
+
+     def read_block(block_id)
+       #puts "Reading block No: #{block_id}"
+       start = HEADER_OFFSET + @block_size * block_id
+       target_range = (start..start+@block_size-1)
+       cur_block = read(target_range)
+     end
+
+     # search within
+     def get_next_block_id(url_to_find, block_id)
+       norm_url = Client.normalize_url(url_to_find, false)
+       cur_block = read_block(block_id)
+
+       not_found = true
+       cur_loc = 4
+       last_block_num = nil
+
+       counter = 0
+
+       while not_found
+
+         counter += 1
+
+         # read from cur_loc
+         next_nil_loc = cur_block.index("\x00", cur_loc)
+
+         break if next_nil_loc == cur_loc + 1
+
+         cur_prefix = cur_block[cur_loc..next_nil_loc-1]
+         cur_block_num = cur_block[next_nil_loc+1..next_nil_loc+1+4].unpack("L")[0]
+
+         if cur_prefix >= norm_url
+           next_block = last_block_num || cur_block_num
+           return next_block
+         end
+
+         break if cur_loc >= @block_size
+
+         last_block_num = cur_block_num
+         cur_loc = next_nil_loc + 1 + 4
+       end
+
+     end
+   end
+ end
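For orientation: the client reads a packed index straight from S3. The file starts with an 8-byte header unpacked as two 32-bit integers (block size and index block count), followed by index blocks that map URL prefixes to block numbers, and data blocks in which each record is a null-terminated normalized URL followed by 32 bytes unpacked as "QQLQL" (segment id, file date, partition, file offset, compressed size). URLs are normalized by reversing the host labels and appending the scheme after a colon. A quick sketch of the round trip, using the same values the specs assert:

    require 'common-crawl-index'

    CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
    # => "com.google.www/test/path:http"

    CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path", false)
    # => "com.google.www/test/path"   (the form used for prefix comparisons)

    CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
    # => "http://www.google.com/test/path"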
data/lib/common-crawl-index/version.rb ADDED
@@ -0,0 +1,3 @@
+ module CommonCrawlIndex
+   VERSION = "0.0.1.alpha"
+ end
data/spec/basic_spec.rb ADDED
@@ -0,0 +1,71 @@
+ require 'spec_helper'
+ describe CommonCrawlIndex do
+
+   AMAZON_ACCESS_KEY_ID = ENV['AMAZON_ACCESS_KEY_ID']
+   AMAZON_SECRET_ACCESS_KEY = ENV['AMAZON_SECRET_ACCESS_KEY']
+
+   it "should config successfully" do
+     settings = {
+       :access_key_id => "access_key",
+       :secret_access_key => "secret_key",
+       :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
+     }
+
+     CommonCrawlIndex::Client.config(settings)
+
+     final_settings = CommonCrawlIndex::Client.class_variable_get(:@@settings)
+
+     final_settings.should == settings
+   end
+
+   it "should initialize client" do
+     client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
+     client.should_not == nil
+   end
+
+   it "should find by prefix" do
+     client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
+
+     total_urls_to_test = 100
+
+     url = "http://www.amazon.com/"
+     normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
+     normalized_url_length = normalized_url.length
+
+     client.find_by_prefix(url) do |url_data|
+       total_urls_to_test -= 1
+       prefix = url_data[:normalized_url][0..normalized_url_length-1]
+       normalized_url.should eql prefix
+       false if total_urls_to_test == 0
+     end
+   end
+
+   it "should match an exact url" do
+     client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
+
+     client.find_by_prefix("http://www.google.com/", true) do |url_data|
+       expected_url_data = {:normalized_url=>"com.google.www/:http", :url=>"http://www.google.com/", :arcSourceSegmentId=>1346823846039, :arcFileDate=>1346870285062, :arcFilePartition=>14, :arcFileOffset=>38347629, :compressedSize=>6198}
+       url_data.should eql expected_url_data
+     end
+   end
+
+   it "should normalize the urls correctly" do
+     normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
+     normalized_url.should == "com.google.www/test/path:http"
+   end
+
+   it "should normalize the urls correctly without scheme" do
+     normalized_url = CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path", false)
+     normalized_url.should == "com.google.www/test/path"
+   end
+
+   it "should denormalize the urls correctly" do
+     url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
+     url.should == "http://www.google.com/test/path"
+   end
+
+   it "should denormalize the urls correctly without scheme" do
+     url = CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path", false)
+     url.should == "http://www.google.com/test/path"
+   end
+ end
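These specs run against the live public dataset on S3, so they expect real credentials in the AMAZON_ACCESS_KEY_ID and AMAZON_SECRET_ACCESS_KEY environment variables. The "should find by prefix" example also demonstrates the early-stop convention; a sketch of the same pattern outside RSpec (the prefix and the cap of 100 results are arbitrary choices for illustration):

    client = CommonCrawlIndex::Client.new(
      ENV['AMAZON_ACCESS_KEY_ID'],
      ENV['AMAZON_SECRET_ACCESS_KEY']
    )

    urls = []
    client.find_by_prefix("http://www.amazon.com/") do |url_data|
      urls << url_data[:url]
      false if urls.size >= 100   # returning false stops the scan
    end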
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,22 @@
+ require 'rubygems'
+ require 'bundler/setup'
+
+ require 'common-crawl-index' # and any other gems you need
+
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # Require this file using `require "spec_helper"` to ensure that it is only
+ # loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   # --seed 1234
+   config.order = 'random'
+ end
metadata ADDED
@@ -0,0 +1,74 @@
+ --- !ruby/object:Gem::Specification
+ name: common-crawl-index
+ version: !ruby/object:Gem::Version
+   version: 0.0.1.alpha
+   prerelease: 6
+ platform: ruby
+ authors:
+ - Amit Ambardekar
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-01-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Access common crawl URL index
+ email:
+ - amitamb@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - .rspec
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - common-crawl-index.gemspec
+ - lib/common-crawl-index.rb
+ - lib/common-crawl-index/version.rb
+ - spec/basic_spec.rb
+ - spec/spec_helper.rb
+ homepage: ''
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>'
+     - !ruby/object:Gem::Version
+       version: 1.3.1
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Access common crawl URL index
+ test_files:
+ - spec/basic_spec.rb
+ - spec/spec_helper.rb