common-crawl-index 0.0.1.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +7 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/common-crawl-index.gemspec +21 -0
- data/lib/common-crawl-index.rb +171 -0
- data/lib/common-crawl-index/version.rb +3 -0
- data/spec/basic_spec.rb +71 -0
- data/spec/spec_helper.rb +22 -0
- metadata +74 -0
    
        data/.gitignore
    ADDED
    
    
    
        data/.rspec
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/LICENSE.txt
    ADDED
    
    | @@ -0,0 +1,22 @@ | |
| 1 | 
            +
            Copyright (c) 2013 Amit Ambardekar
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            MIT License
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            Permission is hereby granted, free of charge, to any person obtaining
         | 
| 6 | 
            +
            a copy of this software and associated documentation files (the
         | 
| 7 | 
            +
            "Software"), to deal in the Software without restriction, including
         | 
| 8 | 
            +
            without limitation the rights to use, copy, modify, merge, publish,
         | 
| 9 | 
            +
            distribute, sublicense, and/or sell copies of the Software, and to
         | 
| 10 | 
            +
            permit persons to whom the Software is furnished to do so, subject to
         | 
| 11 | 
            +
            the following conditions:
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            The above copyright notice and this permission notice shall be
         | 
| 14 | 
            +
            included in all copies or substantial portions of the Software.
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
         | 
| 17 | 
            +
            EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
         | 
| 18 | 
            +
            MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
         | 
| 19 | 
            +
            NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
         | 
| 20 | 
            +
            LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
         | 
| 21 | 
            +
            OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
         | 
| 22 | 
            +
            WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
         | 
    
        data/README.md
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
| 1 | 
            +
            # CommonCrawlIndex
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            Ruby client for looking up URLs in the Common Crawl URL index stored on Amazon S3.
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            ## Installation
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            Add this line to your application's Gemfile:
         | 
| 8 | 
            +
             | 
| 9 | 
            +
                gem 'common-crawl-index'
         | 
| 10 | 
            +
             | 
| 11 | 
            +
            And then execute:
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                $ bundle
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            Or install it yourself as:
         | 
| 16 | 
            +
             | 
| 17 | 
            +
                $ gem install common-crawl-index
         | 
| 18 | 
            +
             | 
| 19 | 
            +
            ## Usage
         | 
| 20 | 
            +
             | 
| 21 | 
            +
            TODO: Write usage instructions here
         | 
| 22 | 
            +
             | 
| 23 | 
            +
            ## Contributing
         | 
| 24 | 
            +
             | 
| 25 | 
            +
            1. Fork it
         | 
| 26 | 
            +
            2. Create your feature branch (`git checkout -b my-new-feature`)
         | 
| 27 | 
            +
            3. Commit your changes (`git commit -am 'Add some feature'`)
         | 
| 28 | 
            +
            4. Push to the branch (`git push origin my-new-feature`)
         | 
| 29 | 
            +
            5. Create new Pull Request
         | 
    
        data/Rakefile
    ADDED
    
    | @@ -0,0 +1 @@ | |
| 1 | 
            +
            require "bundler/gem_tasks"
         | 
| @@ -0,0 +1,21 @@ | |
| 1 | 
            +
            # -*- encoding: utf-8 -*-
            # Gem specification for common-crawl-index.
            lib = File.expand_path('../lib', __FILE__)
            $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
            require 'common-crawl-index/version'

            Gem::Specification.new do |gem|
              gem.name          = "common-crawl-index"
              gem.version       = CommonCrawlIndex::VERSION
              gem.authors       = ["Amit Ambardekar"]
              gem.email         = ["amitamb@gmail.com"]
              gem.description   = %q{Access common crawl URL index}
              gem.summary       = %q{Access common crawl URL index}
              gem.homepage      = ""
              # LICENSE.txt in this package carries the MIT license text.
              gem.license       = "MIT"

              # lib/common-crawl-index.rb does `require 'aws-sdk'` and talks to the
              # AWS::S3 interface of the 1.x SDK, so the gem cannot load without it.
              gem.add_dependency "aws-sdk", "~> 1.0"
              gem.add_development_dependency "rspec"

              gem.files         = `git ls-files`.split($/)
              gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
              gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
              gem.require_paths = ["lib"]
            end
         | 
| @@ -0,0 +1,171 @@ | |
| 1 | 
            +
            require "common-crawl-index/version"
         | 
| 2 | 
            +
            require 'aws-sdk'
         | 
| 3 | 
            +
            require 'open3'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
             | 
| 6 | 
            +
            # URI() is used by the URL helpers below; load it here instead of relying
            # on another library having required it first.
            require 'uri'

            # Namespace for the Common Crawl URL index client.
            module CommonCrawlIndex
              # Looks up URLs in a Common Crawl URL index stored as a single S3 object.
              #
              # As read by this class, the object starts with an 8-byte header holding
              # two 32-bit integers (block size, number of index blocks) followed by
              # fixed-size blocks. Blocks numbered below the index-block count route a
              # lookup to another block; every other block holds URL records of the
              # form <normalized url> NUL <32-byte location record>.
              class Client
                # Process-wide defaults used when a constructor argument is omitted.
                @@settings = {
                  :access_key_id => nil,
                  :secret_access_key => nil,
                  :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792"
                }

                # Size in bytes of the file header that precedes block 0.
                HEADER_OFFSET = 8

                # Integers are decoded with explicit little-endian directives so results
                # no longer depend on the byte order of the host, as they did with the
                # native-order "L"/"Q" directives.
                # NOTE(review): assumes the index is written little-endian - confirm
                # against the index format description.
                UINT32_FORMAT = "V"
                HEADER_FORMAT = "VV"
                # segment id (64), file date (64), partition (32), offset (64), size (32)
                RECORD_FORMAT = "Q<Q<VQ<V"
                # Number of bytes in the location record that follows each URL.
                RECORD_SIZE = 32
                # Terminator that follows every prefix / URL inside a block.
                NUL = "\x00"

                # Merges +settings+ into the class-wide defaults.
                #
                # @param settings [Hash] any of :access_key_id, :secret_access_key,
                #   :cc_index_path
                # @return [Hash] the merged settings
                def self.config(settings = {})
                  @@settings = @@settings.merge(settings)
                end

                # Creates a client and reads the index header from S3.
                #
                # @param access_key_id [String, nil] falls back to the configured value
                # @param secret_access_key [String, nil] falls back to the configured value
                # @param cc_index_path [String, nil] s3:// location of the index object;
                #   falls back to the configured value
                # @raise [ArgumentError] if the path does not start with an s3 scheme
                def initialize(access_key_id = nil, secret_access_key = nil, cc_index_path = nil)
                  @s3 = AWS::S3.new(
                    :access_key_id => access_key_id || @@settings[:access_key_id],
                    :secret_access_key => secret_access_key || @@settings[:secret_access_key]
                  )

                  @cc_index_path = cc_index_path || @@settings[:cc_index_path]

                  # "s3://bucket/a/b" splits into ["s3:", "", "bucket", "a", "b"].
                  proto, _unused, @bucket_name, *rest = @cc_index_path.chomp.split(File::SEPARATOR)
                  raise ArgumentError, "#{__FILE__}: Unknown S3 Protocol #{proto}" unless proto.to_s.start_with?("s3")
                  @object_name = File.join(rest)

                  @block_size, @index_block_count = read(0..(HEADER_OFFSET - 1)).unpack(HEADER_FORMAT)
                end

                # Yields every index entry whose normalized URL starts with +url+, or
                # only the entry equal to +url+ when +exact_match+ is true.
                #
                # The block receives a Hash with the keys :normalized_url, :url,
                # :arcSourceSegmentId, :arcFileDate, :arcFilePartition, :arcFileOffset
                # and :compressedSize. In prefix mode, returning +false+ from the block
                # stops the iteration.
                #
                # @param url [String] absolute URL (or URL prefix) to look up
                # @param exact_match [Boolean] restrict the result to the exact URL
                # @return [false, nil] false when a block was exhausted before any
                #   result was produced (and always at a block end in exact mode),
                #   nil otherwise
                # @raise [ArgumentError] if no block is given
                def find_by_prefix(url, exact_match = false, &proc_block)
                  raise ArgumentError, "find_by_prefix requires a block" unless proc_block

                  next_block = 0
                  # Follow index blocks until the id refers to a data block.
                  while next_block < @index_block_count
                    next_block = get_next_block_id(url, next_block)
                  end
                  get_matching_urls_from_data_blocks(next_block, url, exact_match, &proc_block)
                end

                # Converts a URL to the key form used by the index: host labels are
                # reversed, the scheme is dropped from the front and, optionally,
                # appended after a colon ("http://www.a.com/x" -> "com.a.www/x:http").
                #
                # @param url [String] absolute URL
                # @param append_scheme [Boolean] append ":<scheme>" to the result
                # @return [String]
                # @raise [ArgumentError] if the URL has no host (or no scheme while
                #   +append_scheme+ is set)
                def self.normalize_url(url, append_scheme = true)
                  uri = URI(url)
                  if uri.host.nil? || (append_scheme && uri.scheme.nil?)
                    raise ArgumentError, "URL must be absolute (scheme and host required): #{url.inspect}"
                  end

                  uri.host = uri.host.split(".").reverse.join(".")
                  text = uri.to_s
                  norm_url = text[(text.index("//") + 2)..-1]
                  norm_url += ":" + uri.scheme if append_scheme
                  norm_url
                end

                # Inverse of normalize_url.
                #
                # When +has_scheme+ is true the text after the last colon is taken as
                # the scheme; if there is no colon at all the URL is treated as having
                # no scheme suffix and "http" is assumed (this used to raise).
                #
                # @param normalized_url [String] key as produced by normalize_url
                # @param has_scheme [Boolean] whether the key ends in ":<scheme>"
                # @return [String] the URL with host labels in their usual order
                # @raise [ArgumentError] if no host can be parsed from the key
                def self.denormalize_url(normalized_url, has_scheme = true)
                  scheme = "http"
                  reversed = normalized_url
                  colon_index = has_scheme ? normalized_url.rindex(":") : nil
                  if colon_index
                    scheme = normalized_url[(colon_index + 1)..-1]
                    reversed = normalized_url[0...colon_index]
                  end

                  uri = URI(scheme + "://" + reversed)
                  raise ArgumentError, "no host in normalized URL: #{normalized_url.inspect}" if uri.host.nil?

                  uri.host = uri.host.split(".").reverse.join(".")
                  uri.to_s
                end

                private

                # Memoized handle on the S3 object that holds the index.
                def s3_object
                  @s3_object ||= @s3.buckets[@bucket_name].objects[@object_name]
                end

                # Scans data blocks, starting at +start_block+, and yields matching
                # records to +proc_block+. See find_by_prefix for the yielded Hash and
                # the return value.
                def get_matching_urls_from_data_blocks(start_block, url_to_find, exact_match, &proc_block)
                  norm_url = Client.normalize_url(url_to_find, false)
                  norm_url_length = norm_url.length
                  # Only exact lookups need the scheme-qualified key; compute it once.
                  exact_url = exact_match ? Client.normalize_url(url_to_find, true) : nil

                  first_match_found = false
                  cur_block_index = start_block
                  cur_block = read_block(start_block)
                  cur_loc = 0

                  # `while true` rather than `loop` so a StopIteration raised by the
                  # caller's block is not swallowed.
                  while true
                    if block_exhausted?(cur_block, cur_loc)
                      # Only a prefix run that is already in progress can continue in
                      # the next block; exact lookups never cross a block boundary.
                      return false if !first_match_found || exact_match
                      cur_block_index += 1
                      cur_block = read_block(cur_block_index)
                      cur_loc = 0
                    end

                    nil_loc = cur_block && cur_block.index(NUL, cur_loc)
                    # No terminator: the block is missing, empty or truncated.
                    break if nil_loc.nil?

                    url = cur_block[cur_loc...nil_loc]
                    if url[0, norm_url_length] == norm_url
                      if exact_match
                        if url == exact_url
                          proc_block.call(build_url_data(url, cur_block[nil_loc + 1, RECORD_SIZE]))
                          break
                        end
                      else
                        first_match_found = true
                        break if proc_block.call(build_url_data(url, cur_block[nil_loc + 1, RECORD_SIZE])) == false
                      end
                    elsif first_match_found
                      # The scan stops at the first non-matching record after a match.
                      break
                    end

                    cur_loc = nil_loc + RECORD_SIZE + 1
                  end
                end

                # True when +offset+ is past the block or sits on the NUL padding that
                # follows the last record.
                def block_exhausted?(block, offset)
                  block.nil? || offset >= @block_size || block[offset, 1] == NUL
                end

                # Builds the Hash yielded to callers from a key and its location record.
                def build_url_data(url, record)
                  segment_id, file_date, partition, offset, size = record.unpack(RECORD_FORMAT)
                  {
                    :normalized_url => url,
                    :url => Client.denormalize_url(url),
                    :arcSourceSegmentId => segment_id,
                    :arcFileDate => file_date,
                    :arcFilePartition => partition,
                    :arcFileOffset => offset,
                    :compressedSize => size
                  }
                end

                # Reads an inclusive byte range of the index object.
                def read(target_range)
                  s3_object.read(:range => target_range)
                end

                # Reads block number +block_id+ (block 0 starts right after the header).
                def read_block(block_id)
                  start = HEADER_OFFSET + @block_size * block_id
                  read(start..(start + @block_size - 1))
                end

                # Searches one index block and returns the id of the block to visit next.
                #
                # Entries are read as <prefix> NUL <32-bit block id>, starting at byte 4.
                # The first prefix that sorts at or after the URL selects the block id
                # of the entry before it (or its own id when it is the first entry).
                # When the URL sorts after every prefix, the id of the last entry is
                # returned; previously this case returned nil or raised.
                #
                # NOTE(review): bytes 0..3 of the block are skipped. If they hold a
                # leftmost child id, URLs sorting before the first prefix are routed one
                # block late - confirm against the index format.
                #
                # @raise [RuntimeError] if the block contains no entries
                def get_next_block_id(url_to_find, block_id)
                  norm_url = Client.normalize_url(url_to_find, false)
                  cur_block = read_block(block_id).to_s

                  cur_loc = 4
                  last_block_num = nil

                  while cur_loc < @block_size
                    next_nil_loc = cur_block.index(NUL, cur_loc)
                    # No terminator, or an empty prefix: the NUL padding after the last
                    # entry has been reached. (The old test, next_nil_loc == cur_loc + 1,
                    # matched one-character prefixes instead of the padding.)
                    break if next_nil_loc.nil? || next_nil_loc == cur_loc

                    cur_prefix = cur_block[cur_loc...next_nil_loc]
                    cur_block_num = cur_block[next_nil_loc + 1, 4].unpack(UINT32_FORMAT)[0]
                    # A prefix without a complete block id means the block is truncated.
                    break if cur_block_num.nil?

                    return last_block_num || cur_block_num if cur_prefix >= norm_url

                    last_block_num = cur_block_num
                    cur_loc = next_nil_loc + 1 + 4
                  end

                  raise "index block #{block_id} contains no entries" if last_block_num.nil?
                  last_block_num
                end
              end
            end
         | 
    
        data/spec/basic_spec.rb
    ADDED
    
    | @@ -0,0 +1,71 @@ | |
| 1 | 
            +
            require 'spec_helper'
         | 
| 2 | 
            +
            # Specs for CommonCrawlIndex::Client. The examples that build a client read
            # the real index from S3 and take their credentials from the
            # AMAZON_ACCESS_KEY_ID / AMAZON_SECRET_ACCESS_KEY environment variables.
            describe CommonCrawlIndex do

              AMAZON_ACCESS_KEY_ID = ENV['AMAZON_ACCESS_KEY_ID']
              AMAZON_SECRET_ACCESS_KEY = ENV['AMAZON_SECRET_ACCESS_KEY']

              it "should config successfully" do
                settings = {
                  :access_key_id => "access_key",
                  :secret_access_key => "secret_key",
                  :cc_index_path => "s3://aws-publicdatasets/common-crawl/projects/url-index/url-index.1356128792" # optional
                }

                # Examples run in random order, so put the class-wide settings back
                # afterwards; otherwise the fake credentials leak into other examples.
                original_settings = CommonCrawlIndex::Client.class_variable_get(:@@settings)
                begin
                  CommonCrawlIndex::Client.config(settings)

                  final_settings = CommonCrawlIndex::Client.class_variable_get(:@@settings)

                  final_settings.should == settings
                ensure
                  CommonCrawlIndex::Client.class_variable_set(:@@settings, original_settings)
                end
              end

              it "should initialize client" do
                client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)
                client.should_not == nil
              end

              it "should find by prefix" do
                client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)

                total_urls_to_test = 100

                url = "http://www.amazon.com/"
                normalized_url = CommonCrawlIndex::Client.normalize_url(url, false)
                normalized_url_length = normalized_url.length

                urls_seen = 0
                client.find_by_prefix(url) do |url_data|
                  urls_seen += 1
                  total_urls_to_test -= 1
                  prefix = url_data[:normalized_url][0..normalized_url_length-1]
                  normalized_url.should eql prefix
                  # Returning false stops the scan after the requested number of URLs.
                  false if total_urls_to_test == 0
                end

                # Without this the example passes even when nothing is yielded.
                urls_seen.should be > 0
              end

              it "should match an exact url" do
                client = CommonCrawlIndex::Client.new(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_ACCESS_KEY)

                matches = 0
                client.find_by_prefix("http://www.google.com/", true) do |url_data|
                  matches += 1
                  expected_url_data = {:normalized_url=>"com.google.www/:http", :url=>"http://www.google.com/", :arcSourceSegmentId=>1346823846039, :arcFileDate=>1346870285062, :arcFilePartition=>14, :arcFileOffset=>38347629, :compressedSize=>6198}
                  url_data.should eql expected_url_data
                end

                # An exact lookup must yield the record exactly once.
                matches.should == 1
              end

              it "should normalize the urls correctly" do
                normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path")
                normalized_url.should == "com.google.www/test/path:http"
              end

              it "should normalize the urls correctly without scheme" do
                normalized_url =  CommonCrawlIndex::Client.normalize_url("http://www.google.com/test/path", false)
                normalized_url.should == "com.google.www/test/path"
              end

              it "should denormalize the urls correctly" do
                url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path:http")
                url.should == "http://www.google.com/test/path"
              end

              it "should denormalize the urls correctly without scheme" do
                url =  CommonCrawlIndex::Client.denormalize_url("com.google.www/test/path", false)
                url.should == "http://www.google.com/test/path"
              end
            end
         | 
    
        data/spec/spec_helper.rb
    ADDED
    
    | @@ -0,0 +1,22 @@ | |
| 1 | 
            +
            require 'rubygems'
         | 
| 2 | 
            +
            require 'bundler/setup'
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            require 'common-crawl-index' # and any other gems you need
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # This file was generated by the `rspec --init` command. Conventionally, all
         | 
| 7 | 
            +
            # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
         | 
| 8 | 
            +
            # Require this file using `require "spec_helper"` to ensure that it is only
         | 
| 9 | 
            +
            # loaded once.
         | 
| 10 | 
            +
            #
         | 
| 11 | 
            +
            # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
         | 
| 12 | 
            +
            # Shared RSpec settings for this gem's spec suite.
            RSpec.configure do |rspec|
              # Examples run in random order so that order dependencies show up. To
              # replay a particular order while debugging, pass the seed printed at the
              # end of a run:
              #     --seed 1234
              rspec.order = 'random'

              # Must be set before `filter_run :focus` so the bare symbol is accepted
              # as metadata.
              rspec.treat_symbols_as_metadata_keys_with_true_values = true

              # Run only examples tagged :focus, or everything when none are tagged.
              rspec.run_all_when_everything_filtered = true
              rspec.filter_run :focus
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,74 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: common-crawl-index
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.0.1.alpha
         | 
| 5 | 
            +
              prerelease: 6
         | 
| 6 | 
            +
            platform: ruby
         | 
| 7 | 
            +
            authors:
         | 
| 8 | 
            +
            - Amit Ambardekar
         | 
| 9 | 
            +
            autorequire: 
         | 
| 10 | 
            +
            bindir: bin
         | 
| 11 | 
            +
            cert_chain: []
         | 
| 12 | 
            +
            date: 2013-01-23 00:00:00.000000000 Z
         | 
| 13 | 
            +
            dependencies:
         | 
| 14 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 15 | 
            +
              name: rspec
         | 
| 16 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 17 | 
            +
                none: false
         | 
| 18 | 
            +
                requirements:
         | 
| 19 | 
            +
                - - ! '>='
         | 
| 20 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 21 | 
            +
                    version: '0'
         | 
| 22 | 
            +
              type: :development
         | 
| 23 | 
            +
              prerelease: false
         | 
| 24 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 25 | 
            +
                none: false
         | 
| 26 | 
            +
                requirements:
         | 
| 27 | 
            +
                - - ! '>='
         | 
| 28 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 29 | 
            +
                    version: '0'
         | 
| 30 | 
            +
            description: Access common crawl URL index
         | 
| 31 | 
            +
            email:
         | 
| 32 | 
            +
            - amitamb@gmail.com
         | 
| 33 | 
            +
            executables: []
         | 
| 34 | 
            +
            extensions: []
         | 
| 35 | 
            +
            extra_rdoc_files: []
         | 
| 36 | 
            +
            files:
         | 
| 37 | 
            +
            - .gitignore
         | 
| 38 | 
            +
            - .rspec
         | 
| 39 | 
            +
            - Gemfile
         | 
| 40 | 
            +
            - LICENSE.txt
         | 
| 41 | 
            +
            - README.md
         | 
| 42 | 
            +
            - Rakefile
         | 
| 43 | 
            +
            - common-crawl-index.gemspec
         | 
| 44 | 
            +
            - lib/common-crawl-index.rb
         | 
| 45 | 
            +
            - lib/common-crawl-index/version.rb
         | 
| 46 | 
            +
            - spec/basic_spec.rb
         | 
| 47 | 
            +
            - spec/spec_helper.rb
         | 
| 48 | 
            +
            homepage: ''
         | 
| 49 | 
            +
            licenses: []
         | 
| 50 | 
            +
            post_install_message: 
         | 
| 51 | 
            +
            rdoc_options: []
         | 
| 52 | 
            +
            require_paths:
         | 
| 53 | 
            +
            - lib
         | 
| 54 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 55 | 
            +
              none: false
         | 
| 56 | 
            +
              requirements:
         | 
| 57 | 
            +
              - - ! '>='
         | 
| 58 | 
            +
                - !ruby/object:Gem::Version
         | 
| 59 | 
            +
                  version: '0'
         | 
| 60 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 61 | 
            +
              none: false
         | 
| 62 | 
            +
              requirements:
         | 
| 63 | 
            +
              - - ! '>'
         | 
| 64 | 
            +
                - !ruby/object:Gem::Version
         | 
| 65 | 
            +
                  version: 1.3.1
         | 
| 66 | 
            +
            requirements: []
         | 
| 67 | 
            +
            rubyforge_project: 
         | 
| 68 | 
            +
            rubygems_version: 1.8.24
         | 
| 69 | 
            +
            signing_key: 
         | 
| 70 | 
            +
            specification_version: 3
         | 
| 71 | 
            +
            summary: Access common crawl URL index
         | 
| 72 | 
            +
            test_files:
         | 
| 73 | 
            +
            - spec/basic_spec.rb
         | 
| 74 | 
            +
            - spec/spec_helper.rb
         |