crawl_kit 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ .DS_Store
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env rake
2
+
3
+ require 'rake/testtask'
4
+
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << 'lib'
7
+ t.libs << 'test'
8
+ t.pattern = 'test/**/*_test.rb'
9
+ t.verbose = false
10
+ end
11
+
12
+ desc "Run tests"
13
+ task :default => :test
data/crawl_kit.gemspec ADDED
@@ -0,0 +1,16 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'crawl_kit'
3
+ s.version = '0.0.1'
4
+ s.date = '2011-12-08'
5
+ s.summary = "A collection of serivce interfaces and models to use with the GoTime crawling infrastructure."
6
+ s.author = "Austin Cargol"
7
+ s.email = 'acargol@gotime.com'
8
+
9
+ s.files = `git ls-files`.split("\n")
10
+ s.test_files = `git ls-files -- {test}/*`.split("\n")
11
+ s.require_paths = %w(lib)
12
+
13
+ s.required_ruby_version = '>= 1.9.2'
14
+
15
+ s.add_dependency('aws-sdk', '>= 1.2.3')
16
+ end
@@ -0,0 +1,81 @@
1
+ require 'set'
2
+ require 'uri'
3
+
4
+ module CrawlKit
5
+ class Configuration
6
+
7
+ # Creates a new Configuration object.
8
+ def initialize(options = {})
9
+ options.each_pair do |opt_name, value|
10
+ opt_name = opt_name.to_sym
11
+ if self.class.accepted_options.include?(opt_name)
12
+ supplied[opt_name] = value
13
+ end
14
+ end
15
+ end
16
+
17
+ # Used to create a new Configuration object with the given modifications.
18
+ # The current configuration object is not modified.
19
+ def with(options = {})
20
+ # symbolize option keys
21
+ options = options.inject({}) {|h,kv| h[kv.first.to_sym] = kv.last; h }
22
+
23
+ values = supplied.merge(options)
24
+
25
+ if supplied == values
26
+ self # nothing changed
27
+ else
28
+ self.class.new(values)
29
+ end
30
+ end
31
+
32
+ # @return [Hash] Returns a hash of all configuration values.
33
+ def to_h
34
+ self.class.accepted_options.inject({}) do |h,k|
35
+ h[k] = send(k)
36
+ h
37
+ end
38
+ end
39
+
40
+ # @return [Boolean] Returns true if the two configuration objects have
41
+ # the same values.
42
+ def == other
43
+ other.is_a?(self.class) and self.supplied == other.supplied
44
+ end
45
+
46
+ alias_method :eql, :==
47
+
48
+ # @private
49
+ def inspect
50
+ "<#{self.class.name}>"
51
+ end
52
+
53
+ protected
54
+ def supplied
55
+ @supplied ||= {}
56
+ end
57
+
58
+ class << self
59
+
60
+ # @private
61
+ def accepted_options
62
+ @options ||= Set.new
63
+ end
64
+
65
+ # @private
66
+ def add_option(name, default_value = nil, options = {}, &transform)
67
+ accepted_options << name
68
+
69
+ define_method(name) do
70
+ value = supplied.has_key?(name) ? supplied[name] : default_value
71
+ transform ? transform.call(value) : value
72
+ end
73
+
74
+ alias_method("#{name}?", name) if options[:boolean]
75
+ end
76
+ end
77
+
78
+ add_option :crawl_domain, ENV['CRAWL_DOMAIN']
79
+ add_option :crawl_bucket, ENV['CRAWL_BUCKET']
80
+ end
81
+ end
@@ -0,0 +1,37 @@
1
+ module CrawlKit
2
+ class Record < AWS::Record::Base
3
+ module Helpers
4
+
5
+ def self.included(base)
6
+ base.extend(ClassMethods)
7
+ end
8
+
9
+ module ClassMethods
10
+ def digest
11
+ @digest ||= OpenSSL::Digest::Digest.new('sha1')
12
+ end
13
+
14
+ def hash_uri(uri=nil)
15
+ digest.hexdigest(uri)
16
+ end
17
+ end
18
+
19
+ def hashed_uri
20
+ self.class.digest.hexdigest(uri)
21
+ end
22
+ alias :crawl_id :hashed_uri
23
+
24
+ def fetched?
25
+ CrawlKit::S3.file_exists?(crawl_id)
26
+ end
27
+
28
+ def page
29
+ CrawlKit::S3.get_file(crawl_id)
30
+ end
31
+
32
+ def previous_version
33
+ CrawlKit::S3.get_previous_version(crawl_id)
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,69 @@
1
+ module CrawlKit
2
+ class Record < AWS::Record::Base
3
+ include CrawlKit::Record::Helpers
4
+
5
+ # State Fields
6
+ string_attr :uri
7
+ string_attr :lifecycle, default_value: "new"
8
+ string_attr :page_type
9
+ string_attr :referer
10
+ integer_attr :site_id
11
+ integer_attr :market_id
12
+ integer_attr :crawl_node_id
13
+ integer_attr :external_region_node_id
14
+ integer_attr :external_category_node_id
15
+
16
+ validates_presence_of :uri
17
+
18
+ # Crawl Fields
19
+ string_attr :domain
20
+ string_attr :status
21
+ string_attr :status_code
22
+ string_attr :last_effective_url
23
+ string_attr :location
24
+ string_attr :x_city_id
25
+ string_attr :last_changed
26
+ string_attr :last_visited
27
+ string_attr :last_modified
28
+ string_attr :content_hash
29
+ string_attr :content_length
30
+ string_attr :content_type
31
+ string_attr :etag
32
+ string_attr :x_cvs_id
33
+ string_attr :x_category
34
+
35
+ class << self
36
+ def find(id)
37
+ id =~ /^[^\.]+$/ ? super(id) : super(hash_uri(id))
38
+ end
39
+
40
+ def find_or_create_by_uri(uri)
41
+ find(hash_uri(uri))
42
+ rescue
43
+ new(uri: uri)
44
+ end
45
+ end
46
+
47
+ def seed_attributes
48
+ Hash.new(
49
+ site_id: site_id,
50
+ market_id: market_id,
51
+ crawl_node_id: crawl_node_id,
52
+ external_region_node_id: external_region_node_id,
53
+ external_category_node_id: external_category_node_id,
54
+ referer: uri
55
+ )
56
+ end
57
+
58
+ def update_blank_attributes_only(attributes)
59
+ attributes.delete_if {|key, value| send(key) }
60
+ update_attributes(attributes)
61
+ end
62
+
63
+ private
64
+ # DO NOT CHANGE. This overwrites the default behavior
65
+ def populate_id
66
+ @_id = crawl_id
67
+ end
68
+ end
69
+ end
@@ -0,0 +1,58 @@
1
+ module CrawlKit
2
+ class S3
3
+ class << self
4
+ def connection
5
+ AWS::S3.new
6
+ end
7
+
8
+ def buckets
9
+ connection.buckets
10
+ end
11
+
12
+ def crawl_bucket(bucket_name=nil)
13
+ bucket_name ||= CrawlKit.config.crawl_bucket
14
+ raise "A crawl bucket is required" unless bucket_name
15
+ buckets[bucket_name]
16
+ end
17
+
18
+ def get_object(key)
19
+ crawl_bucket.objects[key]
20
+ end
21
+
22
+ def file_exists?(key)
23
+ get_object(key).exists?
24
+ end
25
+
26
+ def get_file(key)
27
+ if file = get_object(key)
28
+ decompress(file.read)
29
+ end
30
+ end
31
+
32
+ def save_page(key, body, options)
33
+ options.merge!(content_type: 'text/html', content_encoding: 'deflate', storage_class: :reduced_redundancy)
34
+ object = get_object(key)
35
+ object.write(Zlib::Deflate.deflate(body), options)
36
+ end
37
+
38
+ def get_versions(hashed_uri)
39
+ s3_object = get_object(hashed_uri)
40
+ versions = s3_object.versions.to_a
41
+ end
42
+
43
+ # A step of 0 will give the current version.
44
+ def get_previous_version(hashed_uri, step=1)
45
+ previous_version = get_versions(hashed_uri).sort_by{|version| version.head.last_modified}.reverse[step]
46
+ decompress(previous_version.read) if previous_version
47
+ end
48
+
49
+ private
50
+ def decompress(str)
51
+ body = Zlib::Inflate.inflate(str)
52
+ body.force_encoding(::Encoding::UTF_8)
53
+ body
54
+ end
55
+
56
+ end # Class Methods
57
+ end
58
+ end
@@ -0,0 +1,46 @@
1
+ module CrawlKit
2
+ class SimpleDB
3
+ class << self
4
+ def connection
5
+ AWS::SimpleDB.new
6
+ end
7
+
8
+ def crawl_domain(domain_name=nil)
9
+ domain_name ||= CrawlKit.config.crawl_domain
10
+ raise "A crawl domain is required" unless domain_name
11
+ connection.domains[domain_name]
12
+ end
13
+
14
+ def items
15
+ crawl_domain.items
16
+ end
17
+
18
+ def get_item(hashed_uri)
19
+ crawl_domain.items[hashed_uri]
20
+ end
21
+ alias :find :get_item
22
+
23
+ def delete_item(hashed_uri) #replaced delete_attributes
24
+ crawl_domain.items[hashed_uri].try(:delete)
25
+ end
26
+
27
+ def select(params)
28
+ items.where(params)
29
+ end
30
+
31
+ def count(params)
32
+ items.where(params).count
33
+ end
34
+
35
+ def select_each(params)
36
+ select(params).each do |item|
37
+ yield(item)
38
+ end
39
+ end
40
+
41
+ def each_item(params)
42
+ select(params).each {|item| yield(item) }
43
+ end
44
+ end # Class Methods
45
+ end
46
+ end
@@ -0,0 +1,40 @@
1
+ module CrawlKit
2
+ class SQS
3
+ class << self
4
+ def connection
5
+ @connection ||= AWS::SQS.new
6
+ end
7
+
8
+ def get_queue(name="crawl_new")
9
+ queues.detect{|queue|queue.name.eql?(name)}
10
+ end
11
+
12
+ def queues
13
+ connection.queues.to_a
14
+ end
15
+
16
+ def create_queue(name)
17
+ connection.queues.create(name)
18
+ end
19
+ alias :get_or_create_queue :create_queue
20
+
21
+ def enqueue_message(queue, message)
22
+ queue.send_message(ActiveSupport::JSON.encode(message))
23
+ end
24
+
25
+ def consume_message(queue)
26
+ ActiveSupport::JSON.decode(queue.receive_message)
27
+ end
28
+
29
+ def clear_queue(name)
30
+ delete_queue(name)
31
+ sleep(70)
32
+ create_queue(name)
33
+ end
34
+
35
+ def delete_queue(name)
36
+ get_queue(name).try(:delete)
37
+ end
38
+ end # Class Methods
39
+ end
40
+ end
data/lib/crawl_kit.rb ADDED
@@ -0,0 +1,24 @@
1
+ require 'aws-sdk'
2
+
3
+ require 'crawl_kit/configuration'
4
+
5
+ module CrawlKit
6
+
7
+ class << self
8
+ @@config = nil
9
+
10
+ def config(options = {})
11
+ @@config ||= CrawlKit::Configuration.new
12
+ @@config = @@config.with(options) unless options.empty?
13
+ CrawlKit::Record.set_domain_name(@@config.crawl_domain) if @@config.crawl_domain
14
+ @@config
15
+ end
16
+ end
17
+ end
18
+
19
+ require 'crawl_kit/record/helpers'
20
+ require 'crawl_kit/record'
21
+ require 'crawl_kit/s3'
22
+ require 'crawl_kit/simple_db'
23
+ require 'crawl_kit/sqs'
24
+ require 'extensions/aws/sqs/queue.rb'
@@ -0,0 +1,25 @@
1
+ module AWS
2
+ class SQS
3
+ class Queue
4
+ def name
5
+ url[/\/([^\/]+)$/, 1]
6
+ end
7
+
8
+ def enqueue(message)
9
+ send_message(ActiveSupport::JSON.encode(message))
10
+ end
11
+
12
+ def consume
13
+ message = receive_message
14
+ if message
15
+ message.delete
16
+ ActiveSupport::JSON.decode(message.body)
17
+ end
18
+ end
19
+
20
+ def size
21
+ approximate_number_of_messages
22
+ end
23
+ end
24
+ end
25
+ end
metadata ADDED
@@ -0,0 +1,67 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: crawl_kit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Austin Cargol
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-12-08 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: aws-sdk
16
+ requirement: &70283953721820 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: 1.2.3
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70283953721820
25
+ description:
26
+ email: acargol@gotime.com
27
+ executables: []
28
+ extensions: []
29
+ extra_rdoc_files: []
30
+ files:
31
+ - .gitignore
32
+ - Rakefile
33
+ - crawl_kit.gemspec
34
+ - lib/crawl_kit.rb
35
+ - lib/crawl_kit/configuration.rb
36
+ - lib/crawl_kit/record.rb
37
+ - lib/crawl_kit/record/helpers.rb
38
+ - lib/crawl_kit/s3.rb
39
+ - lib/crawl_kit/simple_db.rb
40
+ - lib/crawl_kit/sqs.rb
41
+ - lib/extensions/aws/sqs/queue.rb
42
+ homepage:
43
+ licenses: []
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: 1.9.2
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ! '>='
58
+ - !ruby/object:Gem::Version
59
+ version: '0'
60
+ requirements: []
61
+ rubyforge_project:
62
+ rubygems_version: 1.8.10
63
+ signing_key:
64
+ specification_version: 3
65
+ summary: A collection of service interfaces and models to use with the GoTime crawling
66
+ infrastructure.
67
+ test_files: []