creepin 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in creepin.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Ayre
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Creepin
2
+
3
+ Creepin provides structured crawling and mapping of external sites to your Ruby classes or AR models.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'creepin'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install creepin
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for creepin.
#
# The lib directory is put on the load path first so that the version constant
# can be read from lib/creepin/version.rb without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'creepin/version'

Gem::Specification.new do |gem|
  gem.name          = "creepin"
  gem.version       = Creepin::VERSION
  gem.authors       = ["Jason Ayre"]
  gem.email         = ["jasonayre@gmail.com"]
  gem.description   = %q{Creepin so logically}
  gem.summary       = %q{Provides structured crawling, and mapping, of external sites, to your ruby classes or AR models.}
  gem.homepage      = ""
  # The package ships an MIT LICENSE.txt; declare it so the registry shows it.
  gem.licenses      = ["MIT"]

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "activesupport"
  gem.add_dependency 'httparty'
  gem.add_dependency 'aasm'
  gem.add_dependency 'nokogiri'
  # lib/creepin.rb requires 'rack' (Rack::Utils.parse_query is used by the
  # creepers), so it has to be declared or a plain `gem install` can fail at
  # require time.
  gem.add_dependency 'rack'
end
@@ -0,0 +1,45 @@
1
+ require "creepin/version"
2
+ require 'aasm'
3
+ require 'uri'
4
+ require 'active_support'
5
+ require 'rack'
6
+ require 'httparty'
7
+ require 'nokogiri'
8
+ require 'creepin/collection_creeper'
9
+ require 'creepin/collection'
10
+ require 'creepin/resource_creeper'
11
+ require 'creepin/resource'
12
+
13
+ require 'creepin/on'
14
+
15
# Top-level namespace and bootstrapper.
#
# Creeper definition files are plain Ruby files living under the directories
# returned by load_paths. setup loads them (once) and then evaluates the given
# block against this module.
module Creepin
  # Module-level instance variable rather than a class variable: class
  # variables are shared with everything that includes the module.
  @loaded = false

  # Loads the creeper definition files if that has not happened yet, then
  # evaluates the optional block with this module as self.
  #
  # @yield configuration block (optional)
  # @return [Object, nil] the block's value, or nil when no block is given
  def self.setup(&block)
    load!
    instance_eval(&block) if block
  end

  # Loads every file reported by files_in_load_path.
  #
  # @return [Boolean] false when the files were already loaded, true otherwise
  def self.load!
    # No work to do if we've already loaded
    return false if loaded?

    files_in_load_path.each { |file| load file }
    @loaded = true
  end

  # @return [Boolean] whether load! has completed
  def self.loaded?
    @loaded
  end

  # Directories searched for creeper definitions: app/creepers below the Rails
  # root, or below the current working directory when Rails is not loaded
  # (the gem does not itself depend on Rails).
  #
  # @return [Array<String>] absolute directory paths
  def self.load_paths
    root = if defined?(Rails) && Rails.respond_to?(:root) && Rails.root
             Rails.root
           else
             Dir.pwd
           end
    [File.expand_path('app/creepers', root)]
  end

  # @return [Array<String>] every .rb file below the load paths, sorted per
  #   directory so that load order does not depend on the file system
  def self.files_in_load_path
    load_paths.flatten.compact.uniq.flat_map { |path| Dir["#{path}/**/*.rb"].sort }
  end

end
@@ -0,0 +1,103 @@
1
module Creepin
  # DSL receiver for a `collection` definition.
  #
  # The block handed to the constructor is evaluated against this object, so
  # the methods below (base_url, selector, ...) simply record their argument in
  # @config. After the block has run, a subclass of CollectionCreeper named
  # "<Name>CollectionCreeper" is registered as a top-level constant; it gets an
  # AASM state machine and one reader method per recorded setting.
  class Collection

    # @param name [String] definition name; camelized to build the class name
    # @param options [Hash] may carry :params, extra request parameters
    # @yield configuration block, evaluated with this object as self
    def initialize(name, options={}, &block)
      @config = {}
      @request_params = {}

      instance_eval(&block) if block
      settings = @config.merge(namespace: name)
      # Use the in-place variants: merge / reverse_merge return a new hash, so
      # the previous calls threw their result away and @request_params stayed
      # empty.
      @request_params.merge!(settings[:default_params]) if settings.has_key?(:default_params)
      @request_params.reverse_merge!(options[:params]) if options.has_key?(:params)

      class_name = "#{name.camelize}CollectionCreeper"
      klass = Class.new CollectionCreeper
      dsl_methods = @config.keys
      @creeper_class = Object.const_set(class_name, klass)

      # The original author notes what seems to be a bug with AASM and
      # inheritance; that is why the state machine is declared on each
      # generated class instead of in the CollectionCreeper file.
      @creeper_class.class_eval do

        include AASM

        aasm do
          state :waiting, :initial => true
          state :crawling
          state :parsing
          state :finished
          state :finished_and_loaded

          event :crawl, :after => :transmit do
            transitions :from => :waiting, :to => :crawling
          end

          event :crawl_finished, :after => :run_after_crawl_finished_callbacks do
            transitions :from => :crawling, :to => :finished
          end

          event :crawl_next, :after => :crawl_next_page do
            transitions :from => :finished, :to => :waiting
          end

          event :collection_loaded, :after => :run_after_collection_loaded_callbacks do
            transitions :from => [:waiting, :finished], :to => :finished_and_loaded
          end

        end

        # One reader per configured setting; the values live in the captured
        # settings hash, shared by all instances of the generated class.
        dsl_methods.each do |sym|
          define_method(sym.to_s) {
            settings[sym]
          }
        end

      end

    end

    # Records the URL that is requested for this collection.
    def base_url(string)
      @config[:base_url] = string
    end

    # Records query parameters that are sent with the request.
    def default_params(hash)
      @config[:default_params] = hash
    end

    # Records the CSS selector that picks the collection's elements.
    def selector(string)
      @config[:selector] = string
    end

    # Registers a block that extracts the value of attr_name from one element.
    def define_element_mapping(attr_name, &block)
      @config[:element_mappings] ||= {}
      @config[:element_mappings][attr_name] = block
    end

    # Records the name of the class that mapped attributes are loaded into.
    def resource_class(string)
      @config[:resource_class] = string
    end

    # Records how the next page is found: a CSS selector string or a block.
    # A block takes precedence over the string when both are given.
    def next_page_selector(string='', &block)
      value = block || string
      @config[:next_page_selector] = value unless value.nil?
    end

    # Records whether resources should be left unsaved.
    def skip_resource_save(bool)
      @config[:skip_resource_save] = bool.present? ? bool : false
    end

    # Registers the block used to build a resource from collected attributes.
    # Positional arguments are accepted for interface compatibility but are
    # not used; previously they were forwarded to Proc.new, which has no use
    # for them.
    #
    # @raise [ArgumentError] when no block is given
    def resource_load_strategy(*options, &block)
      raise ArgumentError, "resource_load_strategy requires a block" unless block
      @config[:resource_load_strategy] = block
    end

    # Registers the block used to persist a resource. Positional arguments are
    # accepted for interface compatibility but are not used.
    #
    # @raise [ArgumentError] when no block is given
    def resource_save_strategy(*options, &block)
      raise ArgumentError, "resource_save_strategy requires a block" unless block
      @config[:resource_save_strategy] = block
    end

    # Appends a callback to the list stored under after_<event_name>_callbacks.
    def after(event_name, &block)
      key = "after_#{event_name}_callbacks".to_sym
      (@config[key] ||= []) << block
    end

  end
end
@@ -0,0 +1,141 @@
1
module Creepin
  # Base class of the generated "<Name>CollectionCreeper" classes.
  #
  # It fetches a page, maps every element matched by the configured selector to
  # a resource and follows the "next page" link until none is left. The
  # generated subclass supplies the configuration readers (base_url, selector,
  # element_mappings, resource_class and the optional ones probed with
  # respond_to? below) and the AASM events crawl, crawl_finished, crawl_next
  # and collection_loaded that drive the methods in this class.
  class CollectionCreeper

    attr_accessor :stats, :total_records, :total_pages, :loaded_collection, :started_at, :finished_at, :requested_urls

    # @param params [Hash] request parameters merged over the default ones
    def initialize(params = {})
      @params = params.present? ? params : {}
      @total_records = 0
      @total_pages = 0
      @loaded_collection = []
      @requested_urls = []
    end

    def run_after_crawl_callbacks
      transmit
    end

    # Runs after a page was fetched: parses it, notifies the registered
    # callbacks and moves on to the next page (or finishes).
    def run_after_crawl_finished_callbacks
      parse_response
      after_crawl_finished_callbacks.each { |callback| callback.call(self) } if after_crawl_finished_callbacks?
      crawl_next
    end

    def run_after_collection_loaded_callbacks
      after_collection_loaded_callbacks.each { |callback| callback.call(self) } if after_collection_loaded_callbacks?
    end

    # The three predicates below are true only when the matching callback list
    # was configured and is not empty.
    def after_crawl_finished_callbacks?
      respond_to?(:after_crawl_finished_callbacks) && !after_crawl_finished_callbacks.empty?
    end

    def before_crawl_finished_callbacks?
      respond_to?(:before_crawl_finished_callbacks) && !before_crawl_finished_callbacks.empty?
    end

    def after_collection_loaded_callbacks?
      respond_to?(:after_collection_loaded_callbacks) && !after_collection_loaded_callbacks.empty?
    end

    # Crawls the next page when there is one, otherwise fires collection_loaded.
    def crawl_next_page
      if next_page?
        crawl
      else
        collection_loaded
      end
    end

    # Performs the HTTP request for the current page, records the requested
    # URL and fires crawl_finished. @request_params is only built here when
    # next_page? has not already prepared it from a next-page link.
    def transmit
      @request_params ||= { :query => (default_params? ? default_params.merge(@params) : @params) }
      @response = HTTParty.get(base_url, @request_params)
      @requested_urls << full_request_url(base_url, @request_params)
      @total_pages += 1
      crawl_finished
    end

    # Replaces the request parameters with the query of the given URL (the part
    # after the last "?"; a string without "?" is parsed as a bare query
    # string). Nothing changes when no parameters can be extracted.
    def build_request_params(param_string)
      params_hash = Rack::Utils.parse_query(param_string.split('?').pop)
      @request_params = { :query => params_hash.with_indifferent_access } if params_hash.present?
    end

    # Builds the URL that was requested, for bookkeeping.
    #
    # Keys and values are form-encoded so that spaces, "&" and the like cannot
    # corrupt the recorded URL, and no dangling "?" is appended when there are
    # no parameters.
    #
    # @param base_url [String]
    # @param request_params [Hash] options hash holding the parameters in :query
    # @return [String]
    def full_request_url(base_url, request_params)
      query = request_params[:query]
      return base_url if query.nil? || query.empty?

      separator = base_url.include?('?') ? '&' : '?'
      "#{base_url}#{separator}#{URI.encode_www_form(query)}"
    end

    def parse_response
      @response_html = Nokogiri::HTML::Document.parse(@response.body)
      load_response_collection
      map_response_collection if response_collection?
    end

    def load_response_collection
      @response_collection = @response_html.document.css(selector)
    end

    def response_collection?
      @response_collection.present?
    end

    # Builds a resource through the configured load strategy, or by
    # instantiating resource_klass with the collected attributes.
    def load_resource(collected_attributes_hash, resource_klass)
      if resource_load_strategy?
        resource_load_strategy.call(collected_attributes_hash, resource_klass)
      else
        resource_klass.new(collected_attributes_hash)
      end
    end

    # Persists a resource through the configured save strategy, or by calling
    # #save on it unless saving is switched off.
    #
    # @return [Object] the save strategy's result, otherwise the resource
    #   itself. The default path used to return the result of #save (or nil),
    #   which filled loaded_collection with booleans instead of resources.
    def save_resource(collected_attributes_hash, resource)
      return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?

      resource.save unless skip_resource_save?
      resource
    end

    # True only when skip_resource_save was configured with a truthy value.
    # Checking respond_to? alone treated `skip_resource_save false` as "skip".
    def skip_resource_save?
      !!(respond_to?(:skip_resource_save) && skip_resource_save)
    end

    def resource_load_strategy?
      respond_to?(:resource_load_strategy)
    end

    def resource_save_strategy?
      respond_to?(:resource_save_strategy)
    end

    def default_params?
      respond_to?(:default_params)
    end

    # Maps every matched element to a resource: each element mapping block is
    # run against the element, the results are loaded into a resource, and the
    # resource is saved and appended to loaded_collection.
    def map_response_collection
      @response_collection.each do |ele|
        collected_attributes_hash = {}
        element_mappings.each_pair do |attribute, block|
          collected_attributes_hash[attribute] = instance_exec(ele, &block)
        end
        resource = load_resource(collected_attributes_hash, resource_class.constantize)
        @total_records += 1
        resource = save_resource(collected_attributes_hash, resource)
        loaded_collection << resource
      end
    end

    # Determines whether another page exists and, if so, prepares the request
    # parameters for it.
    #
    # A Proc selector receives the parsed document and returns the next URL. A
    # String selector is treated as CSS; the URL is read from the matched
    # element's href attribute (assumes the selector targets a link - the
    # element itself cannot be split into a query string).
    #
    # @return [Boolean]
    def next_page?
      # No selector configured (or an empty one): single-page collection.
      return false unless respond_to?(:next_page_selector) && next_page_selector.present?

      next_page_url =
        if next_page_selector.is_a?(Proc)
          instance_exec(@response_html.document, &next_page_selector)
        else
          link = @response_html.document.at_css(next_page_selector)
          link && link['href']
        end

      build_request_params(next_page_url) if next_page_url.present?
      @has_next_page = next_page_url.present?
    end

  end
end
@@ -0,0 +1,21 @@
1
module Creepin

  # Groups the creeper definitions that belong to one named site.
  #
  # The block given to the constructor is evaluated against the new object, so
  # `collection` and `resource` can be called bare inside it; both forward the
  # name passed to the constructor.
  class On

    # @param name [String] name handed on to the Collection / Resource builders
    # @yield optional definition block, evaluated with this object as self
    def initialize(name, &block)
      @config = {}
      @name = name
      # instance_eval raises when handed neither a block nor a string, so only
      # evaluate when a block was actually supplied.
      instance_eval(&block) if block
    end

    # Defines a collection creeper for this name.
    #
    # @return [Creepin::Collection]
    def collection(*options, &block)
      Creepin::Collection.new(@name, *options, &block)
    end

    # Defines a resource creeper for this name.
    #
    # @return [Creepin::Resource]
    def resource(*options, &block)
      Creepin::Resource.new(@name, *options, &block)
    end

  end

end
@@ -0,0 +1,91 @@
1
module Creepin

  # DSL receiver for a `resource` definition.
  #
  # The block handed to the constructor is evaluated against this object, so
  # the methods below simply record their argument in @config. After the block
  # has run, a subclass of ResourceCreeper named "<Name>ResourceCreeper" is
  # registered as a top-level constant; it gets an AASM state machine and one
  # reader method per recorded setting.
  class Resource

    # @param name [String] definition name; camelized to build the class name
    # @param options [Hash] may carry :params (extra request parameters) and
    #   :loaded_resource
    # @yield configuration block, evaluated with this object as self
    def initialize(name, options={}, &block)
      @config = {}
      @request_params = {}

      instance_eval(&block) if block
      settings = @config.merge(namespace: name)
      # Use the in-place variants: merge / reverse_merge return a new hash, so
      # the previous calls threw their result away and @request_params stayed
      # empty.
      @request_params.merge!(settings[:default_params]) if settings.has_key?(:default_params)
      @request_params.reverse_merge!(options[:params]) if options.has_key?(:params)
      @loaded_resource = options[:loaded_resource] if options.has_key?(:loaded_resource)

      class_name = "#{name.camelize}ResourceCreeper"
      klass = Class.new ResourceCreeper
      dsl_methods = @config.keys
      creeper_class = Object.const_set(class_name, klass)

      # The original author notes what seems to be a bug with AASM and
      # inheritance; that is why the state machine is declared on each
      # generated class instead of in the ResourceCreeper file.
      creeper_class.class_eval do

        include AASM

        aasm do
          state :waiting, :initial => true
          state :crawling
          state :parsing
          state :finished
          state :finished_and_loaded

          event :crawl, :after => :transmit do
            transitions :from => :waiting, :to => :crawling
          end

          event :crawl_finished, :after => :parse_response do
            transitions :from => :crawling, :to => :finished
          end

        end

        # One reader per configured setting; the values live in the captured
        # settings hash, shared by all instances of the generated class.
        dsl_methods.each do |sym|
          define_method(sym.to_s) {
            settings[sym]
          }
        end

      end

    end

    # Records the URL requested when no url_attribute is configured.
    def base_url(string)
      @config[:base_url] = string
    end

    # Records query parameters that are sent with the request.
    def default_params(hash)
      @config[:default_params] = hash
    end

    # Records the name of the resource method that returns the URL to fetch.
    def url_attribute(sym)
      @config[:url_attribute] = sym.to_sym
    end

    # Records the CSS selector that picks the resource's element.
    def selector(string)
      @config[:selector] = string
    end

    # Registers a block that extracts the value of attr_name from the element.
    def define_element_mapping(attr_name, &block)
      @config[:element_mappings] ||= {}
      @config[:element_mappings][attr_name] = block
    end

    # Records the name of the class that mapped attributes are loaded into.
    def resource_class(string)
      @config[:resource_class] = string
    end

    # Records whether the resource should be left unsaved.
    def skip_resource_save(bool)
      @config[:skip_resource_save] = bool.present? ? bool : false
    end

    # Registers the block used to build a resource from collected attributes.
    # Positional arguments are accepted for interface compatibility but are
    # not used; previously they were forwarded to Proc.new, which has no use
    # for them.
    #
    # @raise [ArgumentError] when no block is given
    def resource_load_strategy(*options, &block)
      raise ArgumentError, "resource_load_strategy requires a block" unless block
      @config[:resource_load_strategy] = block
    end

    # Registers the block used to persist a resource. Positional arguments are
    # accepted for interface compatibility but are not used.
    #
    # @raise [ArgumentError] when no block is given
    def resource_save_strategy(*options, &block)
      raise ArgumentError, "resource_save_strategy requires a block" unless block
      @config[:resource_save_strategy] = block
    end

  end

end
@@ -0,0 +1,97 @@
1
module Creepin

  # Base class of the generated "<Name>ResourceCreeper" classes.
  #
  # It fetches the page of a single, already loaded resource, maps the element
  # matched by the configured selector to attribute values and writes them to
  # the resource. The generated subclass supplies the configuration readers
  # (selector, element_mappings and the optional ones probed with respond_to?
  # below) and the AASM events crawl and crawl_finished.
  class ResourceCreeper

    attr_accessor :collected_attributes_hash, :requested_url, :loaded_resource

    # @param loaded_resource [Object] the object that receives the attributes
    # @param params [Hash] request parameters merged over the default ones
    def initialize(loaded_resource, params = {})
      @params = params.present? ? params : {}
      @loaded_resource = loaded_resource
      @collected_attributes_hash = {}
    end

    def run_after_crawl_callbacks
      transmit
    end

    # Performs the HTTP request and fires crawl_finished.
    #
    # With a url_attribute configured, the URL is read from the loaded
    # resource; otherwise base_url is requested with the default parameters
    # merged with the ones given to the constructor.
    def transmit
      if url_attribute?
        @requested_url = loaded_resource.send(url_attribute)
        @response = HTTParty.get(@requested_url)
      else
        # Non-destructive merge: default_params returns the hash shared by
        # every instance of the generated class, so merge! would leak one
        # instance's parameters into all later requests. default_params is
        # optional, hence the guard.
        @request_params ||= { :query => (default_params? ? default_params.merge(@params) : @params) }
        @response = HTTParty.get(base_url, @request_params)
        @requested_url = full_request_url(base_url, @request_params)
      end

      crawl_finished
    end

    def url_attribute?
      respond_to?(:url_attribute)
    end

    def default_params?
      respond_to?(:default_params)
    end

    # Replaces the request parameters with the query of the given URL (the part
    # after the last "?"; a string without "?" is parsed as a bare query
    # string). Nothing changes when no parameters can be extracted.
    def build_request_params(param_string)
      params_hash = Rack::Utils.parse_query(param_string.split('?').pop)
      @request_params = { :query => params_hash.with_indifferent_access } if params_hash.present?
    end

    # Builds the URL that was requested, for bookkeeping.
    #
    # Keys and values are form-encoded so that spaces, "&" and the like cannot
    # corrupt the recorded URL, and no dangling "?" is appended when there are
    # no parameters.
    #
    # @param base_url [String]
    # @param request_params [Hash] options hash holding the parameters in :query
    # @return [String]
    def full_request_url(base_url, request_params)
      query = request_params[:query]
      return base_url if query.nil? || query.empty?

      separator = base_url.include?('?') ? '&' : '?'
      "#{base_url}#{separator}#{URI.encode_www_form(query)}"
    end

    def parse_response
      @response_html = Nokogiri::HTML::Document.parse(@response.body)
      load_response_resource
      map_response_resource if response_resource?
    end

    def load_response_resource
      @response_resource = @response_html.document.at_css(selector)
    end

    # Runs every element mapping block against the matched element, stores the
    # results in collected_attributes_hash and saves them to the loaded
    # resource.
    #
    # @return [Object] whatever save_resource returns
    def map_response_resource
      element_mappings.each_pair do |attribute, block|
        collected_attributes_hash[attribute] = instance_exec(@response_resource, &block)
      end

      save_resource(collected_attributes_hash, loaded_resource)
    end

    def response_resource?
      @response_resource.present? ? true : false
    end

    # Builds a resource through the configured load strategy, or by
    # instantiating resource_klass with the collected attributes.
    def load_resource(collected_attributes_hash, resource_klass)
      if resource_load_strategy?
        resource_load_strategy.call(collected_attributes_hash, resource_klass)
      else
        resource_klass.new(collected_attributes_hash)
      end
    end

    # Persists the collected attributes through the configured save strategy,
    # or by assigning each one through its writer and calling #save unless
    # saving is switched off.
    #
    # @return [Object] the save strategy's result, otherwise the resource
    def save_resource(collected_attributes_hash, resource)
      return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?

      collected_attributes_hash.each_pair { |k, v| resource.send("#{k}=", v) }
      resource.save unless skip_resource_save?
      resource
    end

    # load_resource relies on this predicate, which was missing from the class.
    def resource_load_strategy?
      respond_to?(:resource_load_strategy)
    end

    def resource_save_strategy?
      respond_to?(:resource_save_strategy)
    end

    # True only when skip_resource_save was configured with a truthy value.
    # Checking respond_to? alone treated `skip_resource_save false` as "skip".
    def skip_resource_save?
      !!(respond_to?(:skip_resource_save) && skip_resource_save)
    end

  end

end
@@ -0,0 +1,3 @@
1
module Creepin
  # Release number of the gem; read by creepin.gemspec. Frozen so that the
  # constant cannot be modified in place at runtime.
  VERSION = "0.0.1".freeze
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: creepin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Ayre
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httparty
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: aasm
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Creepin so logically
79
+ email:
80
+ - jasonayre@gmail.com
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .gitignore
86
+ - Gemfile
87
+ - LICENSE.txt
88
+ - README.md
89
+ - Rakefile
90
+ - creepin.gemspec
91
+ - lib/creepin.rb
92
+ - lib/creepin/collection.rb
93
+ - lib/creepin/collection_creeper.rb
94
+ - lib/creepin/on.rb
95
+ - lib/creepin/resource.rb
96
+ - lib/creepin/resource_creeper.rb
97
+ - lib/creepin/version.rb
98
+ homepage: ''
99
+ licenses: []
100
+ post_install_message:
101
+ rdoc_options: []
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 1.8.24
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: Provides structured crawling, and mapping, of external sites, to your ruby
122
+ classes or AR models.
123
+ test_files: []