crawl_kit 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/Rakefile +13 -0
- data/crawl_kit.gemspec +16 -0
- data/lib/crawl_kit/configuration.rb +81 -0
- data/lib/crawl_kit/record/helpers.rb +37 -0
- data/lib/crawl_kit/record.rb +69 -0
- data/lib/crawl_kit/s3.rb +58 -0
- data/lib/crawl_kit/simple_db.rb +46 -0
- data/lib/crawl_kit/sqs.rb +40 -0
- data/lib/crawl_kit.rb +24 -0
- data/lib/extensions/aws/sqs/queue.rb +25 -0
- metadata +67 -0
data/.gitignore
ADDED
data/Rakefile
ADDED
data/crawl_kit.gemspec
ADDED
# crawl_kit.gemspec — packaging specification for the crawl_kit gem.
Gem::Specification.new do |s|
  s.name    = 'crawl_kit'
  s.version = '0.0.1'
  s.date    = '2011-12-08'
  # FIX: "serivce" -> "service" (typo in the published summary).
  s.summary = "A collection of service interfaces and models to use with the GoTime crawling infrastructure."
  s.author  = "Austin Cargol"
  s.email   = 'acargol@gotime.com'

  # Ship everything tracked by git.
  s.files      = `git ls-files`.split("\n")
  # NOTE(review): `{test}` relies on shell brace expansion, which plain
  # /bin/sh does not perform — confirm test files are actually collected.
  s.test_files = `git ls-files -- {test}/*`.split("\n")
  s.require_paths = %w(lib)

  s.required_ruby_version = '>= 1.9.2'

  s.add_dependency('aws-sdk', '>= 1.2.3')
  # NOTE(review): lib/ also calls ActiveSupport::JSON and Object#try, but
  # activesupport is not declared as a dependency — confirm and add it.
end
require 'set'
require 'uri'

module CrawlKit
  # Value-style configuration object. Only option names registered via
  # Configuration.add_option are accepted; everything else is ignored.
  class Configuration

    # Creates a new Configuration object.
    #
    # @param options [Hash] option name (Symbol or String) => value pairs;
    #   unknown names are silently dropped.
    def initialize(options = {})
      options.each_pair do |opt_name, value|
        opt_name = opt_name.to_sym
        if self.class.accepted_options.include?(opt_name)
          supplied[opt_name] = value
        end
      end
    end

    # Used to create a new Configuration object with the given modifications.
    # The current configuration object is not modified.
    #
    # @return [Configuration] self when nothing changed, otherwise a new object.
    def with(options = {})
      # symbolize option keys
      options = options.inject({}) {|h,kv| h[kv.first.to_sym] = kv.last; h }

      values = supplied.merge(options)

      if supplied == values
        self # nothing changed
      else
        self.class.new(values)
      end
    end

    # @return [Hash] Returns a hash of all configuration values, defaults
    #   included for options that were never supplied.
    def to_h
      self.class.accepted_options.inject({}) do |h,k|
        h[k] = send(k)
        h
      end
    end

    # @return [Boolean] Returns true if the two configuration objects have
    # the same values.
    def == other
      other.is_a?(self.class) and self.supplied == other.supplied
    end

    # FIX: was `alias_method :eql, :==`, which defined a method literally
    # named `eql` instead of Ruby's standard `eql?`. Define the standard
    # predicate and keep the old misspelled name for backward compatibility.
    alias_method :eql?, :==
    alias_method :eql, :==

    # Pair `hash` with `eql?` so Configurations can be used as Hash keys.
    def hash
      supplied.hash
    end

    # @private
    def inspect
      "<#{self.class.name}>"
    end

    protected
    # @return [Hash] the explicitly supplied values (no defaults applied).
    def supplied
      @supplied ||= {}
    end

    class << self

      # @private Set of option names this class accepts.
      def accepted_options
        @options ||= Set.new
      end

      # @private Registers an option: records the name and defines a reader
      # that falls back to +default_value+ and applies an optional transform
      # block to the resolved value.
      def add_option(name, default_value = nil, options = {}, &transform)
        accepted_options << name

        define_method(name) do
          value = supplied.has_key?(name) ? supplied[name] : default_value
          transform ? transform.call(value) : value
        end

        # Boolean options also get a `name?` predicate alias.
        alias_method("#{name}?", name) if options[:boolean]
      end
    end

    add_option :crawl_domain, ENV['CRAWL_DOMAIN']
    add_option :crawl_bucket, ENV['CRAWL_BUCKET']
  end
end
module CrawlKit
  class Record < AWS::Record::Base
    # URI-hashing helpers mixed into Record: a record's crawl id is the
    # SHA1 hex digest of its URI, which is also the S3 key for its page.
    module Helpers

      def self.included(base)
        base.extend(ClassMethods)
      end

      module ClassMethods
        # Memoized SHA1 digest instance shared by all hashing calls.
        def digest
          # FIX: OpenSSL::Digest::Digest is deprecated and removed from
          # modern openssl releases; OpenSSL::Digest.new is the supported
          # API and has been available since Ruby 1.9.
          @digest ||= OpenSSL::Digest.new('SHA1')
        end

        # @return [String] the SHA1 hex digest of +uri+.
        def hash_uri(uri=nil)
          digest.hexdigest(uri)
        end
      end

      # @return [String] SHA1 hex digest of this record's uri attribute.
      def hashed_uri
        self.class.digest.hexdigest(uri)
      end
      alias :crawl_id :hashed_uri

      # True when a crawled page body for this record exists in S3.
      def fetched?
        CrawlKit::S3.file_exists?(crawl_id)
      end

      # The decompressed page body stored in S3 for this record.
      def page
        CrawlKit::S3.get_file(crawl_id)
      end

      # The previous stored S3 version of the page body, if any.
      def previous_version
        CrawlKit::S3.get_previous_version(crawl_id)
      end
    end
  end
end
module CrawlKit
  # SimpleDB-backed model for a crawled page. The record id is the SHA1
  # of the URI (see Helpers / populate_id), not a generated UUID.
  class Record < AWS::Record::Base
    include CrawlKit::Record::Helpers

    # State Fields
    string_attr :uri
    string_attr :lifecycle, default_value: "new"
    string_attr :page_type
    string_attr :referer
    integer_attr :site_id
    integer_attr :market_id
    integer_attr :crawl_node_id
    integer_attr :external_region_node_id
    integer_attr :external_category_node_id

    validates_presence_of :uri

    # Crawl Fields
    string_attr :domain
    string_attr :status
    string_attr :status_code
    string_attr :last_effective_url
    string_attr :location
    string_attr :x_city_id
    string_attr :last_changed
    string_attr :last_visited
    string_attr :last_modified
    string_attr :content_hash
    string_attr :content_length
    string_attr :content_type
    string_attr :etag
    string_attr :x_cvs_id
    string_attr :x_category

    class << self
      # Accepts either an already-hashed id (hex digest, no dots) or a raw
      # URI, which is hashed before lookup.
      # NOTE(review): the ^/$ anchors are per-line, not whole-string; a
      # multi-line id could slip past — \A/\z would be stricter.
      def find(id)
        id =~ /^[^\.]+$/ ? super(id) : super(hash_uri(id))
      end

      # Looks the record up by hashed URI, building a new unsaved record
      # when the lookup fails.
      # NOTE(review): the bare rescue swallows every StandardError, not
      # just "record not found" — consider rescuing the specific AWS
      # error class.
      def find_or_create_by_uri(uri)
        find(hash_uri(uri))
      rescue
        new(uri: uri)
      end
    end

    # Attributes to copy onto records seeded from this one; this record's
    # uri becomes the seeded record's referer.
    def seed_attributes
      # FIX: was Hash.new(site_id: ..., ...), which creates an EMPTY hash
      # whose *default value* is the keyword hash — callers always received
      # {} with no usable keys. A hash literal is what was intended.
      {
        site_id: site_id,
        market_id: market_id,
        crawl_node_id: crawl_node_id,
        external_region_node_id: external_region_node_id,
        external_category_node_id: external_category_node_id,
        referer: uri
      }
    end

    # Assigns only the attributes whose current value is falsey (nil/false),
    # leaving already-populated attributes untouched.
    def update_blank_attributes_only(attributes)
      attributes.delete_if {|key, value| send(key) }
      update_attributes(attributes)
    end

    private
    # DO NOT CHANGE. This overwrites the default behavior so the record id
    # is the SHA1 of the uri (crawl_id) instead of a generated id.
    def populate_id
      @_id = crawl_id
    end
  end
end
data/lib/crawl_kit/s3.rb
ADDED
module CrawlKit
  # S3 facade for storing and retrieving deflate-compressed page bodies,
  # keyed by the record's hashed URI.
  class S3
    class << self
      # A fresh AWS::S3 client (credentials come from the global AWS config).
      def connection
        AWS::S3.new
      end

      def buckets
        connection.buckets
      end

      # The bucket holding crawled pages; defaults to the configured
      # crawl_bucket. Raises when no bucket name is available.
      def crawl_bucket(bucket_name=nil)
        bucket_name ||= CrawlKit.config.crawl_bucket
        raise "A crawl bucket is required" unless bucket_name
        buckets[bucket_name]
      end

      # S3 object handle for +key+ (no network call is made here).
      def get_object(key)
        crawl_bucket.objects[key]
      end

      def file_exists?(key)
        get_object(key).exists?
      end

      # Reads and decompresses the stored page body for +key+.
      # NOTE(review): get_object returns a handle even for missing keys, so
      # this nil-guard never skips; a missing key raises on read — confirm
      # that is the intended behavior.
      def get_file(key)
        if file = get_object(key)
          decompress(file.read)
        end
      end

      # Deflates +body+ and writes it under +key+ with fixed content
      # metadata and reduced-redundancy storage.
      # FIX: was options.merge!, which mutated the caller's hash as a side
      # effect; build a new hash instead (forced values still win).
      def save_page(key, body, options)
        options = options.merge(content_type: 'text/html', content_encoding: 'deflate', storage_class: :reduced_redundancy)
        object = get_object(key)
        object.write(Zlib::Deflate.deflate(body), options)
      end

      # All stored versions of the object for +hashed_uri+.
      def get_versions(hashed_uri)
        # (dropped a useless local that was assigned and never used)
        get_object(hashed_uri).versions.to_a
      end

      # A step of 0 will give the current version; step 1 (default) the one
      # before it. Returns nil when that many versions do not exist.
      def get_previous_version(hashed_uri, step=1)
        previous_version = get_versions(hashed_uri).sort_by{|version| version.head.last_modified}.reverse[step]
        decompress(previous_version.read) if previous_version
      end

      private
      # Inflates a deflate-compressed body and tags the result as UTF-8.
      def decompress(str)
        body = Zlib::Inflate.inflate(str)
        body.force_encoding(::Encoding::UTF_8)
        body
      end

    end # Class Methods
  end
end
module CrawlKit
  # SimpleDB facade over the configured crawl domain.
  class SimpleDB
    class << self
      # A fresh AWS::SimpleDB client.
      def connection
        AWS::SimpleDB.new
      end

      # The SimpleDB domain holding crawl records; defaults to the
      # configured crawl_domain. Raises when no domain name is available.
      def crawl_domain(domain_name=nil)
        domain_name ||= CrawlKit.config.crawl_domain
        raise "A crawl domain is required" unless domain_name
        connection.domains[domain_name]
      end

      def items
        crawl_domain.items
      end

      # Item handle for +hashed_uri+ (no network call is made here).
      def get_item(hashed_uri)
        crawl_domain.items[hashed_uri]
      end
      alias :find :get_item

      def delete_item(hashed_uri) #replaced delete_attributes
        # FIX: Object#try comes from ActiveSupport, which this gem neither
        # declares nor requires — calling it would NoMethodError outside a
        # Rails host. An explicit nil-guard preserves the same behavior.
        item = crawl_domain.items[hashed_uri]
        item.delete if item
      end

      # Lazy query over items matching +params+ (a SimpleDB where clause).
      def select(params)
        items.where(params)
      end

      def count(params)
        items.where(params).count
      end

      # Yields each item matching +params+.
      def select_each(params)
        select(params).each do |item|
          yield(item)
        end
      end

      # NOTE(review): duplicate of select_each — consider aliasing one to
      # the other instead of maintaining two implementations.
      def each_item(params)
        select(params).each {|item| yield(item) }
      end
    end # Class Methods
  end
end
module CrawlKit
  # SQS facade for the crawl work queues. Message bodies are JSON.
  class SQS
    class << self
      # Memoized AWS::SQS client (unlike S3/SimpleDB, which build a fresh
      # client per call).
      def connection
        @connection ||= AWS::SQS.new
      end

      # Finds an existing queue by short name; returns nil when absent.
      def get_queue(name="crawl_new")
        queues.detect{|queue|queue.name.eql?(name)}
      end

      def queues
        connection.queues.to_a
      end

      # SQS create_queue is idempotent for an existing name, hence the alias.
      def create_queue(name)
        connection.queues.create(name)
      end
      alias :get_or_create_queue :create_queue

      def enqueue_message(queue, message)
        queue.send_message(ActiveSupport::JSON.encode(message))
      end

      # Receives one message, deletes it, and returns the decoded JSON
      # payload; returns nil when the queue is empty.
      # FIX: previously decoded the ReceivedMessage object itself (not its
      # body), never deleted it, and crashed on an empty queue. This now
      # matches AWS::SQS::Queue#consume in extensions/aws/sqs/queue.rb.
      def consume_message(queue)
        message = queue.receive_message
        if message
          message.delete
          ActiveSupport::JSON.decode(message.body)
        end
      end

      # Deletes and recreates the queue to empty it.
      def clear_queue(name)
        delete_queue(name)
        # SQS requires ~60s after deleting a queue before the same name can
        # be recreated; 70s leaves a safety margin.
        sleep(70)
        create_queue(name)
      end

      def delete_queue(name)
        # NOTE(review): Object#try is ActiveSupport, which this gem does not
        # declare as a dependency — confirm it is loaded by the host app.
        get_queue(name).try(:delete)
      end
    end # Class Methods
  end
end
data/lib/crawl_kit.rb
ADDED
require 'aws-sdk'

require 'crawl_kit/configuration'

module CrawlKit

  class << self
    # Returns the process-wide configuration, first layering any +options+
    # on top of the current values, and keeps the Record model pointed at
    # the configured SimpleDB domain.
    #
    # FIX: was a @@class variable declared inside `class << self` — class
    # variables there belong to the singleton class, a well-known Ruby
    # pitfall (and a general anti-pattern). A class instance variable on
    # the module gives identical observable behavior.
    def config(options = {})
      @config ||= CrawlKit::Configuration.new
      @config = @config.with(options) unless options.empty?
      CrawlKit::Record.set_domain_name(@config.crawl_domain) if @config.crawl_domain
      @config
    end
  end
end

require 'crawl_kit/record/helpers'
require 'crawl_kit/record'
require 'crawl_kit/s3'
require 'crawl_kit/simple_db'
require 'crawl_kit/sqs'
require 'extensions/aws/sqs/queue.rb'
|
# Monkey-patch of the aws-sdk Queue class adding JSON-oriented helpers.
# NOTE(review): reopening a third-party class is fragile across aws-sdk
# upgrades — a mixin or wrapper would be safer.
module AWS
  class SQS
    class Queue
      # The queue's short name: the last path segment of its URL.
      def name
        url[/\/([^\/]+)$/, 1]
      end

      # Serializes +message+ to JSON and sends it.
      # NOTE(review): ActiveSupport::JSON is used but activesupport is not
      # declared as a gem dependency — confirm the host app loads it.
      def enqueue(message)
        send_message(ActiveSupport::JSON.encode(message))
      end

      # Receives one message, deletes it, and returns the decoded JSON
      # payload; returns nil when the queue is empty.
      # NOTE(review): the message is deleted *before* decoding, so a decode
      # failure loses the message permanently — confirm that is acceptable.
      def consume
        message = receive_message
        if message
          message.delete
          ActiveSupport::JSON.decode(message.body)
        end
      end

      # Approximate number of messages currently in the queue (SQS only
      # guarantees an approximation).
      def size
        approximate_number_of_messages
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: crawl_kit
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Austin Cargol
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-12-08 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: aws-sdk
|
16
|
+
requirement: &70283953721820 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 1.2.3
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70283953721820
|
25
|
+
description:
|
26
|
+
email: acargol@gotime.com
|
27
|
+
executables: []
|
28
|
+
extensions: []
|
29
|
+
extra_rdoc_files: []
|
30
|
+
files:
|
31
|
+
- .gitignore
|
32
|
+
- Rakefile
|
33
|
+
- crawl_kit.gemspec
|
34
|
+
- lib/crawl_kit.rb
|
35
|
+
- lib/crawl_kit/configuration.rb
|
36
|
+
- lib/crawl_kit/record.rb
|
37
|
+
- lib/crawl_kit/record/helpers.rb
|
38
|
+
- lib/crawl_kit/s3.rb
|
39
|
+
- lib/crawl_kit/simple_db.rb
|
40
|
+
- lib/crawl_kit/sqs.rb
|
41
|
+
- lib/extensions/aws/sqs/queue.rb
|
42
|
+
homepage:
|
43
|
+
licenses: []
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.9.2
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ! '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 1.8.10
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: A collection of service interfaces and models to use with the GoTime crawling
|
66
|
+
infrastructure.
|
67
|
+
test_files: []
|