crawl_kit 0.0.1
This diff shows the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- data/.gitignore +2 -0
- data/Rakefile +13 -0
- data/crawl_kit.gemspec +16 -0
- data/lib/crawl_kit/configuration.rb +81 -0
- data/lib/crawl_kit/record/helpers.rb +37 -0
- data/lib/crawl_kit/record.rb +69 -0
- data/lib/crawl_kit/s3.rb +58 -0
- data/lib/crawl_kit/simple_db.rb +46 -0
- data/lib/crawl_kit/sqs.rb +40 -0
- data/lib/crawl_kit.rb +24 -0
- data/lib/extensions/aws/sqs/queue.rb +25 -0
- metadata +67 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
data/crawl_kit.gemspec
ADDED
@@ -0,0 +1,16 @@
Gem::Specification.new do |s|
  s.name = 'crawl_kit'
  s.version = '0.0.1'
  s.date = '2011-12-08'
  s.summary = "A collection of serivce interfaces and models to use with the GoTime crawling infrastructure."
  s.author = "Austin Cargol"
  s.email = 'acargol@gotime.com'

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test}/*`.split("\n")
  s.require_paths = %w(lib)

  s.required_ruby_version = '>= 1.9.2'

  s.add_dependency('aws-sdk', '>= 1.2.3')
end
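For orientation, the spec can be loaded and inspected with the standard RubyGems API. This is only an illustrative sketch and is not part of the package; because the spec shells out to `git ls-files`, it has to run from a checkout of the gem's repository.

require 'rubygems'

# Hypothetical inspection of the spec above; paths/values shown are from the diff.
spec = Gem::Specification.load('crawl_kit.gemspec')
spec.name                              #=> "crawl_kit"
spec.version.to_s                      #=> "0.0.1"
spec.runtime_dependencies.map(&:name)  #=> ["aws-sdk"]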
data/lib/crawl_kit/configuration.rb
ADDED
@@ -0,0 +1,81 @@
require 'set'
require 'uri'

module CrawlKit
  class Configuration

    # Creates a new Configuration object.
    def initialize(options = {})
      options.each_pair do |opt_name, value|
        opt_name = opt_name.to_sym
        if self.class.accepted_options.include?(opt_name)
          supplied[opt_name] = value
        end
      end
    end

    # Used to create a new Configuration object with the given modifications.
    # The current configuration object is not modified.
    def with(options = {})
      # symbolize option keys
      options = options.inject({}) {|h,kv| h[kv.first.to_sym] = kv.last; h }

      values = supplied.merge(options)

      if supplied == values
        self # nothing changed
      else
        self.class.new(values)
      end
    end

    # @return [Hash] Returns a hash of all configuration values.
    def to_h
      self.class.accepted_options.inject({}) do |h,k|
        h[k] = send(k)
        h
      end
    end

    # @return [Boolean] Returns true if the two configuration objects have
    # the same values.
    def == other
      other.is_a?(self.class) and self.supplied == other.supplied
    end

    alias_method :eql, :==

    # @private
    def inspect
      "<#{self.class.name}>"
    end

    protected
    def supplied
      @supplied ||= {}
    end

    class << self

      # @private
      def accepted_options
        @options ||= Set.new
      end

      # @private
      def add_option(name, default_value = nil, options = {}, &transform)
        accepted_options << name

        define_method(name) do
          value = supplied.has_key?(name) ? supplied[name] : default_value
          transform ? transform.call(value) : value
        end

        alias_method("#{name}?", name) if options[:boolean]
      end
    end

    add_option :crawl_domain, ENV['CRAWL_DOMAIN']
    add_option :crawl_bucket, ENV['CRAWL_BUCKET']
  end
end
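`add_option` registers a key and defines a reader that falls back to a default (here the CRAWL_DOMAIN and CRAWL_BUCKET environment variables), and `with` returns a fresh copy rather than mutating the receiver. A minimal usage sketch, assuming the gem's lib directory is on the load path; the option values are placeholders, not anything shipped with the gem:

require 'crawl_kit/configuration'

# Unknown keys are silently dropped; only accepted options are kept.
config = CrawlKit::Configuration.new(crawl_domain: 'crawl_pages', ignored: 'dropped')

config.crawl_domain             #=> "crawl_pages"
config.crawl_bucket             #=> falls back to ENV['CRAWL_BUCKET']

# `with` returns a new object when values change, and the same object otherwise.
override = config.with(crawl_bucket: 'crawl-pages-staging')
override.crawl_bucket           #=> "crawl-pages-staging"
config.equal?(config.with({}))  #=> true, nothing changed

config.to_h                     #=> { crawl_domain: "crawl_pages", crawl_bucket: ... }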
data/lib/crawl_kit/record/helpers.rb
ADDED
@@ -0,0 +1,37 @@
module CrawlKit
  class Record < AWS::Record::Base
    module Helpers

      def self.included(base)
        base.extend(ClassMethods)
      end

      module ClassMethods
        def digest
          @digest ||= OpenSSL::Digest::Digest.new('sha1')
        end

        def hash_uri(uri=nil)
          digest.hexdigest(uri)
        end
      end

      def hashed_uri
        self.class.digest.hexdigest(uri)
      end
      alias :crawl_id :hashed_uri

      def fetched?
        CrawlKit::S3.file_exists?(crawl_id)
      end

      def page
        CrawlKit::S3.get_file(crawl_id)
      end

      def previous_version
        CrawlKit::S3.get_previous_version(crawl_id)
      end
    end
  end
end
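The helpers tie a record to its stored page: the SHA-1 hex digest of the URI doubles as the record id and the S3 key. The hashing half can be reproduced in isolation with no AWS calls; this sketch uses a placeholder URI and the current OpenSSL spelling (the gem uses the older OpenSSL::Digest::Digest form):

require 'openssl'

uri = 'http://example.com/listing/42'   # placeholder

# Same SHA-1 hex digest the helpers compute and memoize per class.
crawl_id = OpenSSL::Digest.new('SHA1').hexdigest(uri)
# crawl_id is a 40-character hex string, used as both the record id and the S3 key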
data/lib/crawl_kit/record.rb
ADDED
@@ -0,0 +1,69 @@
module CrawlKit
  class Record < AWS::Record::Base
    include CrawlKit::Record::Helpers

    # State Fields
    string_attr :uri
    string_attr :lifecycle, default_value: "new"
    string_attr :page_type
    string_attr :referer
    integer_attr :site_id
    integer_attr :market_id
    integer_attr :crawl_node_id
    integer_attr :external_region_node_id
    integer_attr :external_category_node_id

    validates_presence_of :uri

    # Crawl Fields
    string_attr :domain
    string_attr :status
    string_attr :status_code
    string_attr :last_effective_url
    string_attr :location
    string_attr :x_city_id
    string_attr :last_changed
    string_attr :last_visited
    string_attr :last_modified
    string_attr :content_hash
    string_attr :content_length
    string_attr :content_type
    string_attr :etag
    string_attr :x_cvs_id
    string_attr :x_category

    class << self
      def find(id)
        id =~ /^[^\.]+$/ ? super(id) : super(hash_uri(id))
      end

      def find_or_create_by_uri(uri)
        find(hash_uri(uri))
      rescue
        new(uri: uri)
      end
    end

    def seed_attributes
      Hash.new(
        site_id: site_id,
        market_id: market_id,
        crawl_node_id: crawl_node_id,
        external_region_node_id: external_region_node_id,
        external_category_node_id: external_category_node_id,
        referer: uri
      )
    end

    def update_blank_attributes_only(attributes)
      attributes.delete_if {|key, value| send(key) }
      update_attributes(attributes)
    end

    private
    # DO NOT CHANGE. This overwrites the default behavior
    def populate_id
      @_id = crawl_id
    end
  end
end
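Because `populate_id` replaces the default record id with `crawl_id`, a record can be addressed either by its URI or by the SHA-1 of that URI. A hedged usage sketch, assuming AWS credentials and `CrawlKit.config(crawl_domain: ...)` are already set up; the URI and attribute values are placeholders:

record = CrawlKit::Record.find_or_create_by_uri('http://example.com/listing/42')
record.save  # new records are persisted under the SHA-1 of the URI (populate_id)

# find hashes anything containing a "." before looking it up, so a URI and its
# crawl_id resolve to the same item.
CrawlKit::Record.find('http://example.com/listing/42')
CrawlKit::Record.find(record.crawl_id)

# Fill in only attributes that are currently blank.
record.update_blank_attributes_only(status: 'fetched', page_type: 'listing')

record.fetched?  # true once a body has been stored in S3 under record.crawl_id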
data/lib/crawl_kit/s3.rb
ADDED
@@ -0,0 +1,58 @@
module CrawlKit
  class S3
    class << self
      def connection
        AWS::S3.new
      end

      def buckets
        connection.buckets
      end

      def crawl_bucket(bucket_name=nil)
        bucket_name ||= CrawlKit.config.crawl_bucket
        raise "A crawl bucket is required" unless bucket_name
        buckets[bucket_name]
      end

      def get_object(key)
        crawl_bucket.objects[key]
      end

      def file_exists?(key)
        get_object(key).exists?
      end

      def get_file(key)
        if file = get_object(key)
          decompress(file.read)
        end
      end

      def save_page(key, body, options)
        options.merge!(content_type: 'text/html', content_encoding: 'deflate', storage_class: :reduced_redundancy)
        object = get_object(key)
        object.write(Zlib::Deflate.deflate(body), options)
      end

      def get_versions(hashed_uri)
        s3_object = get_object(hashed_uri)
        versions = s3_object.versions.to_a
      end

      # A step of 0 will give the current version.
      def get_previous_version(hashed_uri, step=1)
        previous_version = get_versions(hashed_uri).sort_by{|version| version.head.last_modified}.reverse[step]
        decompress(previous_version.read) if previous_version
      end

      private
      def decompress(str)
        body = Zlib::Inflate.inflate(str)
        body.force_encoding(::Encoding::UTF_8)
        body
      end

    end # Class Methods
  end
end
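`save_page` deflates the body before writing it with `content_encoding: 'deflate'`, and `get_file` inflates it back and forces UTF-8. The compression round trip can be sanity-checked locally with pure Zlib, mirroring what `save_page` and `decompress` do on either side of S3:

require 'zlib'

body = "<html><body>hello</body></html>"   # placeholder page body

stored   = Zlib::Deflate.deflate(body)      # what save_page writes to the S3 object
restored = Zlib::Inflate.inflate(stored)
restored.force_encoding(::Encoding::UTF_8)  # what decompress does on the way out

restored == body  #=> true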
data/lib/crawl_kit/simple_db.rb
ADDED
@@ -0,0 +1,46 @@
module CrawlKit
  class SimpleDB
    class << self
      def connection
        AWS::SimpleDB.new
      end

      def crawl_domain(domain_name=nil)
        domain_name ||= CrawlKit.config.crawl_domain
        raise "A crawl domain is required" unless domain_name
        connection.domains[domain_name]
      end

      def items
        crawl_domain.items
      end

      def get_item(hashed_uri)
        crawl_domain.items[hashed_uri]
      end
      alias :find :get_item

      def delete_item(hashed_uri) #replaced delete_attributes
        crawl_domain.items[hashed_uri].try(:delete)
      end

      def select(params)
        items.where(params)
      end

      def count(params)
        items.where(params).count
      end

      def select_each(params)
        select(params).each do |item|
          yield(item)
        end
      end

      def each_item(params)
        select(params).each {|item| yield(item) }
      end
    end # Class Methods
  end
end
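The class is a thin wrapper around the aws-sdk v1 item collection, so `select`, `count`, and `each_item` pass their arguments through to `ItemCollection#where`. A hedged sketch, assuming credentials and `CrawlKit.config(crawl_domain: ...)` are configured; the attribute values are placeholders:

pending = CrawlKit::SimpleDB.count(lifecycle: 'new')

CrawlKit::SimpleDB.each_item(lifecycle: 'new') do |item|
  puts item.name  # the item name is the hashed URI
end

# Direct lookup by hashed URI; hash_uri comes from CrawlKit::Record::Helpers.
item = CrawlKit::SimpleDB.find(CrawlKit::Record.hash_uri('http://example.com/'))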
data/lib/crawl_kit/sqs.rb
ADDED
@@ -0,0 +1,40 @@
module CrawlKit
  class SQS
    class << self
      def connection
        @connection ||= AWS::SQS.new
      end

      def get_queue(name="crawl_new")
        queues.detect{|queue|queue.name.eql?(name)}
      end

      def queues
        connection.queues.to_a
      end

      def create_queue(name)
        connection.queues.create(name)
      end
      alias :get_or_create_queue :create_queue

      def enqueue_message(queue, message)
        queue.send_message(ActiveSupport::JSON.encode(message))
      end

      def consume_message(queue)
        ActiveSupport::JSON.decode(queue.receive_message)
      end

      def clear_queue(name)
        delete_queue(name)
        sleep(70)
        create_queue(name)
      end

      def delete_queue(name)
        get_queue(name).try(:delete)
      end
    end # Class Methods
  end
end
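A short usage sketch, assuming AWS credentials are configured; "crawl_new" matches the default queue name above and the payload is a placeholder. Note that the JSON helpers rely on ActiveSupport, which the gemspec does not declare as a dependency:

queue = CrawlKit::SQS.get_or_create_queue('crawl_new')

# Payloads are JSON-encoded hashes.
CrawlKit::SQS.enqueue_message(queue, uri: 'http://example.com/', page_type: 'listing')

# clear_queue deletes and recreates the queue; the sleep(70) covers SQS's
# requirement to wait about 60 seconds before reusing a deleted queue's name.
CrawlKit::SQS.clear_queue('crawl_new')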
data/lib/crawl_kit.rb
ADDED
@@ -0,0 +1,24 @@
require 'aws-sdk'

require 'crawl_kit/configuration'

module CrawlKit

  class << self
    @@config = nil

    def config(options = {})
      @@config ||= CrawlKit::Configuration.new
      @@config = @@config.with(options) unless options.empty?
      CrawlKit::Record.set_domain_name(@@config.crawl_domain) if @@config.crawl_domain
      @@config
    end
  end
end

require 'crawl_kit/record/helpers'
require 'crawl_kit/record'
require 'crawl_kit/s3'
require 'crawl_kit/simple_db'
require 'crawl_kit/sqs'
require 'extensions/aws/sqs/queue.rb'
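`CrawlKit.config` memoizes a single Configuration in a class variable and, as a side effect, points `CrawlKit::Record` at the configured SimpleDB domain. A hedged bootstrap sketch; the domain and bucket names are placeholders, and credentials are assumed to be supplied separately (here via aws-sdk v1's `AWS.config`):

require 'crawl_kit'

AWS.config(access_key_id: ENV['AWS_ACCESS_KEY_ID'],
           secret_access_key: ENV['AWS_SECRET_ACCESS_KEY'])

# The first call seeds the configuration; later calls return the same object
# unless new options are passed.
CrawlKit.config(crawl_domain: 'crawl_pages', crawl_bucket: 'crawl-page-bodies')

CrawlKit.config.crawl_bucket  #=> "crawl-page-bodies"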
data/lib/extensions/aws/sqs/queue.rb
ADDED
@@ -0,0 +1,25 @@
module AWS
  class SQS
    class Queue
      def name
        url[/\/([^\/]+)$/, 1]
      end

      def enqueue(message)
        send_message(ActiveSupport::JSON.encode(message))
      end

      def consume
        message = receive_message
        if message
          message.delete
          ActiveSupport::JSON.decode(message.body)
        end
      end

      def size
        approximate_number_of_messages
      end
    end
  end
end
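This extension monkey-patches `AWS::SQS::Queue` with JSON-aware `enqueue`/`consume` helpers and derives the queue name from its URL. A hedged usage sketch; the payload is a placeholder, and ActiveSupport's JSON must be available:

require 'active_support/json'

queue = CrawlKit::SQS.get_or_create_queue('crawl_new')
queue.name  #=> "crawl_new", parsed out of the queue URL

queue.enqueue(uri: 'http://example.com/', page_type: 'listing')

# consume returns nil when no message is available; otherwise it deletes the
# message and returns the decoded hash (string keys after JSON decoding).
if (payload = queue.consume)
  payload['uri']
end

queue.size  # approximate number of visible messages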
metadata
ADDED
@@ -0,0 +1,67 @@
--- !ruby/object:Gem::Specification
name: crawl_kit
version: !ruby/object:Gem::Version
  version: 0.0.1
prerelease:
platform: ruby
authors:
- Austin Cargol
autorequire:
bindir: bin
cert_chain: []
date: 2011-12-08 00:00:00.000000000Z
dependencies:
- !ruby/object:Gem::Dependency
  name: aws-sdk
  requirement: &70283953721820 !ruby/object:Gem::Requirement
    none: false
    requirements:
    - - ! '>='
      - !ruby/object:Gem::Version
        version: 1.2.3
  type: :runtime
  prerelease: false
  version_requirements: *70283953721820
description:
email: acargol@gotime.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Rakefile
- crawl_kit.gemspec
- lib/crawl_kit.rb
- lib/crawl_kit/configuration.rb
- lib/crawl_kit/record.rb
- lib/crawl_kit/record/helpers.rb
- lib/crawl_kit/s3.rb
- lib/crawl_kit/simple_db.rb
- lib/crawl_kit/sqs.rb
- lib/extensions/aws/sqs/queue.rb
homepage:
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: 1.9.2
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 1.8.10
signing_key:
specification_version: 3
summary: A collection of serivce interfaces and models to use with the GoTime crawling
  infrastructure.
test_files: []