crawl_kit 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,2 @@
+ *.gem
+ .DS_Store
data/Rakefile ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env rake
+
+ require 'rake/testtask'
+
+ Rake::TestTask.new(:test) do |t|
+   t.libs << 'lib'
+   t.libs << 'test'
+   t.pattern = 'test/**/*_test.rb'
+   t.verbose = false
+ end
+
+ desc "Run tests"
+ task :default => :test
data/crawl_kit.gemspec ADDED
@@ -0,0 +1,16 @@
+ Gem::Specification.new do |s|
+   s.name = 'crawl_kit'
+   s.version = '0.0.1'
+   s.date = '2011-12-08'
+   s.summary = "A collection of service interfaces and models to use with the GoTime crawling infrastructure."
+   s.author = "Austin Cargol"
+   s.email = 'acargol@gotime.com'
+
+   s.files = `git ls-files`.split("\n")
+   s.test_files = `git ls-files -- test/*`.split("\n")
+   s.require_paths = %w(lib)
+
+   s.required_ruby_version = '>= 1.9.2'
+
+   s.add_dependency('aws-sdk', '>= 1.2.3')
+ end
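
A quick way to sanity-check the spec locally, using the standard RubyGems CLI:

  gem build crawl_kit.gemspec        # produces crawl_kit-0.0.1.gem
  gem install ./crawl_kit-0.0.1.gem
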
data/lib/crawl_kit/configuration.rb ADDED
@@ -0,0 +1,81 @@
+ require 'set'
+ require 'uri'
+
+ module CrawlKit
+   class Configuration
+
+     # Creates a new Configuration object.
+     def initialize(options = {})
+       options.each_pair do |opt_name, value|
+         opt_name = opt_name.to_sym
+         if self.class.accepted_options.include?(opt_name)
+           supplied[opt_name] = value
+         end
+       end
+     end
+
+     # Returns a new Configuration object with the given modifications.
+     # The current configuration object is not modified.
+     def with(options = {})
+       # symbolize option keys
+       options = options.inject({}) {|h,kv| h[kv.first.to_sym] = kv.last; h }
+
+       values = supplied.merge(options)
+
+       if supplied == values
+         self # nothing changed
+       else
+         self.class.new(values)
+       end
+     end
+
+     # @return [Hash] Returns a hash of all configuration values.
+     def to_h
+       self.class.accepted_options.inject({}) do |h,k|
+         h[k] = send(k)
+         h
+       end
+     end
+
+     # @return [Boolean] Returns true if the two configuration objects have
+     #   the same values.
+     def ==(other)
+       other.is_a?(self.class) and self.supplied == other.supplied
+     end
+
+     alias_method :eql?, :==
+
+     # @private
+     def inspect
+       "<#{self.class.name}>"
+     end
+
+     protected
+     def supplied
+       @supplied ||= {}
+     end
+
+     class << self
+
+       # @private
+       def accepted_options
+         @options ||= Set.new
+       end
+
+       # @private
+       def add_option(name, default_value = nil, options = {}, &transform)
+         accepted_options << name
+
+         define_method(name) do
+           value = supplied.has_key?(name) ? supplied[name] : default_value
+           transform ? transform.call(value) : value
+         end
+
+         alias_method("#{name}?", name) if options[:boolean]
+       end
+     end
+
+     add_option :crawl_domain, ENV['CRAWL_DOMAIN']
+     add_option :crawl_bucket, ENV['CRAWL_BUCKET']
+   end
+ end
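
A minimal usage sketch of Configuration; the option values are hypothetical, and each option falls back to its ENV default when unset:

  config = CrawlKit::Configuration.new(crawl_domain: 'crawl-records')
  config.crawl_domain             #=> "crawl-records"

  # `with` never mutates: it returns self when nothing changes, else a copy
  staging = config.with(crawl_domain: 'crawl-records-staging')
  staging == config               #=> false
  config.with({}).equal?(config)  #=> true
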
data/lib/crawl_kit/record/helpers.rb ADDED
@@ -0,0 +1,37 @@
+ module CrawlKit
+   class Record < AWS::Record::Base
+     module Helpers
+
+       def self.included(base)
+         base.extend(ClassMethods)
+       end
+
+       module ClassMethods
+         def digest
+           @digest ||= OpenSSL::Digest::Digest.new('sha1')
+         end
+
+         def hash_uri(uri)
+           digest.hexdigest(uri)
+         end
+       end
+
+       def hashed_uri
+         self.class.digest.hexdigest(uri)
+       end
+       alias :crawl_id :hashed_uri
+
+       def fetched?
+         CrawlKit::S3.file_exists?(crawl_id)
+       end
+
+       def page
+         CrawlKit::S3.get_file(crawl_id)
+       end
+
+       def previous_version
+         CrawlKit::S3.get_previous_version(crawl_id)
+       end
+     end
+   end
+ end
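
Every record is keyed by the SHA-1 of its URI; a quick sketch (the URI is hypothetical):

  CrawlKit::Record.hash_uri('http://example.com/venues/42')
  #=> a 40-character hex digest, used both as the SimpleDB item id and the S3 key
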
data/lib/crawl_kit/record.rb ADDED
@@ -0,0 +1,70 @@
+ module CrawlKit
+   class Record < AWS::Record::Base
+     include CrawlKit::Record::Helpers
+
+     # State Fields
+     string_attr :uri
+     string_attr :lifecycle, default_value: "new"
+     string_attr :page_type
+     string_attr :referer
+     integer_attr :site_id
+     integer_attr :market_id
+     integer_attr :crawl_node_id
+     integer_attr :external_region_node_id
+     integer_attr :external_category_node_id
+
+     validates_presence_of :uri
+
+     # Crawl Fields
+     string_attr :domain
+     string_attr :status
+     string_attr :status_code
+     string_attr :last_effective_url
+     string_attr :location
+     string_attr :x_city_id
+     string_attr :last_changed
+     string_attr :last_visited
+     string_attr :last_modified
+     string_attr :content_hash
+     string_attr :content_length
+     string_attr :content_type
+     string_attr :etag
+     string_attr :x_cvs_id
+     string_attr :x_category
+
+     class << self
+       # Accepts either an already-hashed id (no dots) or a raw URI to hash.
+       def find(id)
+         id =~ /^[^\.]+$/ ? super(id) : super(hash_uri(id))
+       end
+
+       def find_or_create_by_uri(uri)
+         find(hash_uri(uri))
+       rescue
+         new(uri: uri)
+       end
+     end
+
+     def seed_attributes
+       {
+         site_id: site_id,
+         market_id: market_id,
+         crawl_node_id: crawl_node_id,
+         external_region_node_id: external_region_node_id,
+         external_category_node_id: external_category_node_id,
+         referer: uri
+       }
+     end
+
+     def update_blank_attributes_only(attributes)
+       attributes.delete_if {|key, value| send(key) }
+       update_attributes(attributes)
+     end
+
+     private
+     # DO NOT CHANGE. This overrides the default id generation (records are keyed by crawl_id).
+     def populate_id
+       @_id = crawl_id
+     end
+   end
+ end
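
A hedged usage sketch, assuming CrawlKit.config has pointed Record at a SimpleDB domain (the URI is hypothetical):

  record = CrawlKit::Record.find_or_create_by_uri('http://example.com/venues/42')
  record.lifecycle        #=> "new" unless the stored record says otherwise
  record.fetched?         # true once the page body exists in S3
  record.seed_attributes  # site/market/node ids to copy onto links found on this page
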
data/lib/crawl_kit/s3.rb ADDED
@@ -0,0 +1,59 @@
+ module CrawlKit
+   class S3
+     class << self
+       def connection
+         AWS::S3.new
+       end
+
+       def buckets
+         connection.buckets
+       end
+
+       def crawl_bucket(bucket_name=nil)
+         bucket_name ||= CrawlKit.config.crawl_bucket
+         raise "A crawl bucket is required" unless bucket_name
+         buckets[bucket_name]
+       end
+
+       def get_object(key)
+         crawl_bucket.objects[key]
+       end
+
+       def file_exists?(key)
+         get_object(key).exists?
+       end
+
+       def get_file(key)
+         if file = get_object(key)
+           decompress(file.read)
+         end
+       end
+
+       # Deflates the body and stores it with reduced-redundancy storage.
+       def save_page(key, body, options = {})
+         options = options.merge(content_type: 'text/html', content_encoding: 'deflate', storage_class: :reduced_redundancy)
+         object = get_object(key)
+         object.write(Zlib::Deflate.deflate(body), options)
+       end
+
+       def get_versions(hashed_uri)
+         s3_object = get_object(hashed_uri)
+         s3_object.versions.to_a
+       end
+
+       # A step of 0 will give the current version.
+       def get_previous_version(hashed_uri, step=1)
+         previous_version = get_versions(hashed_uri).sort_by {|version| version.head.last_modified }.reverse[step]
+         decompress(previous_version.read) if previous_version
+       end
+
+       private
+       def decompress(str)
+         body = Zlib::Inflate.inflate(str)
+         body.force_encoding(::Encoding::UTF_8)
+         body
+       end
+
+     end # Class Methods
+   end
+ end
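
A round-trip sketch, assuming AWS credentials and a versioned crawl bucket are configured (key and body are hypothetical):

  key = CrawlKit::Record.hash_uri('http://example.com/')
  CrawlKit::S3.save_page(key, '<html>...</html>', {})  # deflates, then writes
  CrawlKit::S3.get_file(key)                           #=> inflated UTF-8 body
  CrawlKit::S3.get_previous_version(key)               # step=1: the prior stored copy
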
data/lib/crawl_kit/simple_db.rb ADDED
@@ -0,0 +1,46 @@
+ module CrawlKit
+   class SimpleDB
+     class << self
+       def connection
+         AWS::SimpleDB.new
+       end
+
+       def crawl_domain(domain_name=nil)
+         domain_name ||= CrawlKit.config.crawl_domain
+         raise "A crawl domain is required" unless domain_name
+         connection.domains[domain_name]
+       end
+
+       def items
+         crawl_domain.items
+       end
+
+       def get_item(hashed_uri)
+         crawl_domain.items[hashed_uri]
+       end
+       alias :find :get_item
+
+       def delete_item(hashed_uri) # replaced delete_attributes
+         crawl_domain.items[hashed_uri].try(:delete)
+       end
+
+       def select(params)
+         items.where(params)
+       end
+
+       def count(params)
+         items.where(params).count
+       end
+
+       def select_each(params)
+         select(params).each do |item|
+           yield(item)
+         end
+       end
+
+       def each_item(params)
+         select(params).each {|item| yield(item) }
+       end
+     end # Class Methods
+   end
+ end
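
A query sketch using the aws-sdk where-clause syntax; the lifecycle value comes from Record's default:

  CrawlKit::SimpleDB.count("lifecycle = 'new'")
  CrawlKit::SimpleDB.each_item("lifecycle = 'new'") do |item|
    puts item.name  # the hashed URI
  end
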
data/lib/crawl_kit/sqs.rb ADDED
@@ -0,0 +1,42 @@
+ module CrawlKit
+   class SQS
+     class << self
+       def connection
+         @connection ||= AWS::SQS.new
+       end
+
+       def get_queue(name="crawl_new")
+         queues.detect {|queue| queue.name.eql?(name) }
+       end
+
+       def queues
+         connection.queues.to_a
+       end
+
+       def create_queue(name)
+         connection.queues.create(name)
+       end
+       alias :get_or_create_queue :create_queue
+
+       def enqueue_message(queue, message)
+         queue.send_message(ActiveSupport::JSON.encode(message))
+       end
+
+       def consume_message(queue)
+         message = queue.receive_message
+         ActiveSupport::JSON.decode(message.body) if message
+       end
+
+       def clear_queue(name)
+         delete_queue(name)
+         # SQS requires ~60 seconds after deletion before a name can be reused
+         sleep(70)
+         create_queue(name)
+       end
+
+       def delete_queue(name)
+         get_queue(name).try(:delete)
+       end
+     end # Class Methods
+   end
+ end
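
A sketch of the queue helpers, assuming ActiveSupport is available (the gemspec does not declare it); note that consume_message, unlike Queue#consume below, leaves the message on the queue:

  queue = CrawlKit::SQS.get_or_create_queue('crawl_new')
  CrawlKit::SQS.enqueue_message(queue, 'uri' => 'http://example.com/')
  CrawlKit::SQS.consume_message(queue)  #=> {"uri"=>"http://example.com/"}
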
data/lib/crawl_kit.rb ADDED
@@ -0,0 +1,24 @@
+ require 'aws-sdk'
+
+ require 'crawl_kit/configuration'
+
+ module CrawlKit
+
+   class << self
+     @@config = nil
+
+     def config(options = {})
+       @@config ||= CrawlKit::Configuration.new
+       @@config = @@config.with(options) unless options.empty?
+       CrawlKit::Record.set_domain_name(@@config.crawl_domain) if @@config.crawl_domain
+       @@config
+     end
+   end
+ end
+
+ require 'crawl_kit/record/helpers'
+ require 'crawl_kit/record'
+ require 'crawl_kit/s3'
+ require 'crawl_kit/simple_db'
+ require 'crawl_kit/sqs'
+ require 'extensions/aws/sqs/queue.rb'
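
Typical boot sequence, a sketch with hypothetical names:

  require 'crawl_kit'

  CrawlKit.config(crawl_domain: 'crawl-records', crawl_bucket: 'crawl-pages')
  CrawlKit.config.crawl_domain  #=> "crawl-records"; also sets Record's SimpleDB domain
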
data/lib/extensions/aws/sqs/queue.rb ADDED
@@ -0,0 +1,26 @@
+ module AWS
+   class SQS
+     class Queue
+       # The queue name is the last path segment of the queue URL.
+       def name
+         url[/\/([^\/]+)$/, 1]
+       end
+
+       def enqueue(message)
+         send_message(ActiveSupport::JSON.encode(message))
+       end
+
+       def consume
+         message = receive_message
+         if message
+           message.delete
+           ActiveSupport::JSON.decode(message.body)
+         end
+       end
+
+       def size
+         approximate_number_of_messages
+       end
+     end
+   end
+ end
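
With the extension loaded, any AWS::SQS::Queue gains a JSON-oriented interface (continuing the queue from the SQS sketch above):

  queue.enqueue('uri' => 'http://example.com/a')
  queue.consume  # decodes the JSON body and deletes the message
  queue.size     # approximate_number_of_messages
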
metadata ADDED
@@ -0,0 +1,67 @@
+ --- !ruby/object:Gem::Specification
+ name: crawl_kit
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - Austin Cargol
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2011-12-08 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: aws-sdk
+   requirement: &70283953721820 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.2.3
+   type: :runtime
+   prerelease: false
+   version_requirements: *70283953721820
+ description:
+ email: acargol@gotime.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Rakefile
+ - crawl_kit.gemspec
+ - lib/crawl_kit.rb
+ - lib/crawl_kit/configuration.rb
+ - lib/crawl_kit/record.rb
+ - lib/crawl_kit/record/helpers.rb
+ - lib/crawl_kit/s3.rb
+ - lib/crawl_kit/simple_db.rb
+ - lib/crawl_kit/sqs.rb
+ - lib/extensions/aws/sqs/queue.rb
+ homepage:
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: A collection of service interfaces and models to use with the GoTime crawling
+   infrastructure.
+ test_files: []