creepin 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in creepin.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jason Ayre
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Creepin
2
+
3
+ Creepin provides structured crawling of external sites and maps the scraped data onto your Ruby classes or ActiveRecord models.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'creepin'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install creepin
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,24 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for creepin.
#
# Puts the gem's own lib/ directory on the load path so the version constant
# can be read, then declares the packaged files and runtime dependencies.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'creepin/version'

Gem::Specification.new do |gem|
  gem.name          = "creepin"
  gem.version       = Creepin::VERSION
  gem.authors       = ["Jason Ayre"]
  gem.email         = ["jasonayre@gmail.com"]
  gem.description   = %q{Creepin so logically}
  gem.summary       = %q{Provides structured crawling, and mapping, of external sites, to your ruby classes or AR models.}
  gem.homepage      = ""
  # LICENSE.txt ships the MIT license text, so declare it in the spec too.
  gem.license       = "MIT"

  # The packaged file list is whatever git currently tracks.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency "activesupport"
  gem.add_dependency 'httparty'
  gem.add_dependency 'aasm'
  gem.add_dependency 'nokogiri'
  # lib/creepin.rb requires 'rack' and the creepers call Rack::Utils.parse_query,
  # so rack has to be declared as a runtime dependency as well.
  gem.add_dependency 'rack'
end
@@ -0,0 +1,45 @@
1
+ require "creepin/version"
2
+ require 'aasm'
3
+ require 'uri'
4
+ require 'active_support'
5
+ require 'rack'
6
+ require 'httparty'
7
+ require 'nokogiri'
8
+ require 'creepin/collection_creeper'
9
+ require 'creepin/collection'
10
+ require 'creepin/resource_creeper'
11
+ require 'creepin/resource'
12
+
13
+ require 'creepin/on'
14
+
15
# Top-level namespace for the gem; also responsible for loading the
# application's creeper definition files exactly once.
module Creepin
  # Whether the creeper definition files have been loaded. Kept as a module
  # instance variable instead of a class variable.
  @loaded = false

  # Loads the creeper definitions (first call only) and then evaluates the
  # optional block with Creepin as self.
  #
  # @yield optional block, run through instance_eval
  # @return [Object, nil] the block's value, or nil when no block is given
  def self.setup(&block)
    load!
    # Calling instance_eval with a nil block raises an unhelpful
    # ArgumentError, so a block-less setup simply loads the files.
    instance_eval(&block) if block
  end

  # Loads every file found by files_in_load_path, unless already loaded.
  #
  # @return [Boolean] false when loading had already happened, true otherwise
  def self.load!
    # No work to do if we've already loaded
    return false if loaded?

    files_in_load_path.each { |file| load file }
    @loaded = true
  end

  # @return [Boolean] whether load! has already loaded the files
  def self.loaded?
    @loaded
  end

  # @return [Array<String>] absolute directories searched for creeper files
  def self.load_paths
    [File.expand_path('app/creepers', root_path)]
  end

  # @return [Array<String>] every .rb file found (recursively) in load_paths
  def self.files_in_load_path
    load_paths.flatten.compact.uniq.flat_map { |path| Dir["#{path}/**/*.rb"] }
  end

  # Base directory for load_paths: Rails.root when Rails is loaded and has a
  # root, otherwise the current working directory, so the gem does not raise
  # NameError outside of Rails.
  def self.root_path
    rails_root = Rails.root if defined?(Rails) && Rails.respond_to?(:root)
    rails_root || Dir.pwd
  end
  private_class_method :root_path
end
@@ -0,0 +1,103 @@
1
module Creepin
  # DSL object behind `collection do ... end`.
  #
  # The block is evaluated against the instance to fill @config; a new
  # "<Name>CollectionCreeper" subclass of CollectionCreeper is then registered
  # as a top-level constant, with one reader method per configured setting.
  class Collection

    # @param name [String, Symbol] namespace; camelized to build the class name
    # @param options [Hash] optional :params merged underneath default_params
    # @yield DSL block evaluated with instance_eval
    # @raise [ArgumentError] when no block is given
    def initialize(name, options = {}, &block)
      raise ArgumentError, 'Creepin::Collection requires a configuration block' unless block

      @config = {}
      @request_params = {}

      instance_eval(&block)
      settings = @config.merge(namespace: name)
      # The non-bang merges used previously threw their results away; mutate
      # or reassign so the computed params are actually kept.
      @request_params.merge!(settings[:default_params]) if settings.key?(:default_params)
      # Same precedence as reverse_merge: values already present win.
      @request_params = options[:params].merge(@request_params) if options.key?(:params)

      # to_s lets Symbol names work as well as Strings.
      class_name = "#{name.to_s.camelize}CollectionCreeper"
      dsl_methods = @config.keys
      @creeper_class = Object.const_set(class_name, Class.new(CollectionCreeper))

      # AASM is set up on every generated subclass rather than once on
      # CollectionCreeper; the original author suspected an AASM inheritance bug.
      @creeper_class.class_eval do
        include AASM

        aasm do
          state :waiting, :initial => true
          state :crawling
          state :parsing
          state :finished
          state :finished_and_loaded

          event :crawl, :after => :transmit do
            transitions :from => :waiting, :to => :crawling
          end

          event :crawl_finished, :after => :run_after_crawl_finished_callbacks do
            transitions :from => :crawling, :to => :finished
          end

          event :crawl_next, :after => :crawl_next_page do
            transitions :from => :finished, :to => :waiting
          end

          event :collection_loaded, :after => :run_after_collection_loaded_callbacks do
            transitions :from => [:waiting, :finished], :to => :finished_and_loaded
          end
        end

        # One reader per configured setting; readers close over `settings`.
        # :namespace is in settings but not in @config, so it gets no reader.
        dsl_methods.each do |setting|
          define_method(setting.to_s) { settings[setting] }
        end
      end
    end

    # Stores the URL the creeper requests.
    def base_url(string)
      @config[:base_url] = string
    end

    # Stores query parameters merged into the creeper's first request.
    def default_params(hash)
      @config[:default_params] = hash
    end

    # Stores the CSS selector used to find the collection's elements.
    def selector(string)
      @config[:selector] = string
    end

    # Registers a block that extracts attr_name from a matched element.
    def define_element_mapping(attr_name, &block)
      @config[:element_mappings] ||= {}
      @config[:element_mappings][attr_name] = block
    end

    # Stores the name of the class instantiated for each matched element.
    def resource_class(string)
      @config[:resource_class] = string
    end

    # Stores how to find the next page: a selector string or a block.
    # When both are given the block wins; nothing is stored for nil/no block.
    def next_page_selector(string = '', &block)
      chosen = block || string
      @config[:next_page_selector] = chosen unless chosen.nil?
    end

    # Stores the skip flag; blank values are normalized to false.
    def skip_resource_save(bool)
      @config[:skip_resource_save] = bool.present? ? bool : false
    end

    # Stores the block used to build a resource from collected attributes.
    # Positional options are accepted for backward compatibility and ignored.
    #
    # @raise [ArgumentError] when no block is given
    def resource_load_strategy(*options, &block)
      @config[:resource_load_strategy] = strategy_block(block, :resource_load_strategy)
    end

    # Stores the block used to persist a resource.
    # Positional options are accepted for backward compatibility and ignored.
    #
    # @raise [ArgumentError] when no block is given
    def resource_save_strategy(*options, &block)
      @config[:resource_save_strategy] = strategy_block(block, :resource_save_strategy)
    end

    # Appends a callback to the "after_<event_name>_callbacks" setting.
    def after(event_name, &block)
      key = :"after_#{event_name}_callbacks"
      (@config[key] ||= []) << block
    end

    private

    # Returns the given block, insisting that one was supplied.
    def strategy_block(block, dsl_name)
      raise ArgumentError, "#{dsl_name} requires a block" unless block

      block
    end

  end
end
@@ -0,0 +1,141 @@
1
module Creepin
  # Base class for the "<Name>CollectionCreeper" classes generated by
  # Creepin::Collection.
  #
  # The generated subclass supplies the AASM events called here (crawl,
  # crawl_finished, crawl_next, collection_loaded) and one reader per
  # configured setting (base_url, selector, element_mappings, ...). Optional
  # settings are therefore probed with respond_to? before being read.
  class CollectionCreeper

    attr_accessor :stats, :total_records, :total_pages, :loaded_collection,
                  :started_at, :finished_at, :requested_urls

    # @param params [Hash] query parameters for the first request
    def initialize(params = {})
      @params = params.present? ? params : {}
      @total_records = 0
      @total_pages = 0
      @loaded_collection = []
      @requested_urls = []
    end

    def run_after_crawl_callbacks
      transmit
    end

    # Runs after the crawl_finished event: parses the page, fires the
    # configured callbacks, then triggers the crawl_next event.
    def run_after_crawl_finished_callbacks
      parse_response
      after_crawl_finished_callbacks.each { |callback| callback.call(self) } if after_crawl_finished_callbacks?
      crawl_next
    end

    # Runs after the collection_loaded event.
    def run_after_collection_loaded_callbacks
      after_collection_loaded_callbacks.each { |callback| callback.call(self) } if after_collection_loaded_callbacks?
    end

    def after_crawl_finished_callbacks?
      callbacks_configured?(:after_crawl_finished_callbacks)
    end

    def before_crawl_finished_callbacks?
      callbacks_configured?(:before_crawl_finished_callbacks)
    end

    def after_collection_loaded_callbacks?
      callbacks_configured?(:after_collection_loaded_callbacks)
    end

    # Crawls the next page when there is one, otherwise marks the collection
    # as loaded.
    def crawl_next_page
      if next_page?
        crawl
      else
        collection_loaded
      end
    end

    # Performs the HTTP GET for the current request params, records the
    # requested URL and page count, then triggers crawl_finished.
    def transmit
      @request_params ||= { :query => (default_params? ? default_params.merge(@params) : @params) }
      @response = HTTParty.get(base_url, @request_params)
      @requested_urls << full_request_url(base_url, @request_params)
      @total_pages += 1
      crawl_finished
    end

    # Replaces the request params with the query string of the given URL.
    # Leaves them untouched when the parsed query is blank.
    def build_request_params(param_string)
      params_hash = Rack::Utils.parse_query(param_string.split('?').pop)
      @request_params = { :query => params_hash.with_indifferent_access } if params_hash.present?
    end

    # @return [String] base_url followed by "?" and the key=value pairs
    def full_request_url(base_url, request_params)
      query_string = request_params[:query].map { |k, v| "#{k}=#{v}" }.join("&")
      "#{base_url}?#{query_string}"
    end

    def parse_response
      @response_html = Nokogiri::HTML::Document.parse(@response.body)
      load_response_collection
      map_response_collection if response_collection?
    end

    def load_response_collection
      @response_collection = @response_html.document.css(selector)
    end

    def response_collection?
      @response_collection.present?
    end

    # Builds a resource through the configured load strategy, or with
    # resource_klass.new(attributes) when none is configured.
    def load_resource(collected_attributes_hash, resource_klass)
      if resource_load_strategy?
        resource_load_strategy.call(collected_attributes_hash, resource_klass)
      else
        resource_klass.new(collected_attributes_hash)
      end
    end

    # Persists a resource through the configured save strategy (returning the
    # strategy's result), or calls resource.save unless saving is skipped.
    #
    # The default branch returns the resource itself, so loaded_collection
    # holds resources rather than the true/false/nil that `save` returned.
    def save_resource(collected_attributes_hash, resource)
      return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?

      resource.save unless skip_resource_save?
      resource
    end

    # True only when the setting is configured AND truthy. Checking just
    # respond_to? made `skip_resource_save false` skip the save as well.
    def skip_resource_save?
      (respond_to?(:skip_resource_save) && skip_resource_save) ? true : false
    end

    def resource_load_strategy?
      respond_to?(:resource_load_strategy)
    end

    def resource_save_strategy?
      respond_to?(:resource_save_strategy)
    end

    def default_params?
      respond_to?(:default_params)
    end

    # Runs every element mapping against every matched element, then loads,
    # counts, saves and collects the resulting resource.
    def map_response_collection
      @response_collection.each do |ele|
        collected_attributes_hash = {}
        element_mappings.each_pair do |attribute, block|
          collected_attributes_hash[attribute] = instance_exec(ele, &block)
        end
        resource = load_resource(collected_attributes_hash, resource_class.constantize)
        @total_records += 1
        loaded_collection << save_resource(collected_attributes_hash, resource)
      end
    end

    # Works out whether another page exists and, if so, points the request
    # params at it.
    #
    # @return [Boolean]
    def next_page?
      # The reader only exists when next_page_selector was configured.
      return false unless respond_to?(:next_page_selector)

      chosen = next_page_selector
      # The DSL stores '' when called without arguments; treat it as unset.
      return false if chosen.nil? || chosen == ''

      next_page_url =
        if chosen.is_a?(Proc)
          instance_exec(@response_html.document, &chosen)
        else
          # at_css returns a node, while build_request_params needs a URL
          # string. NOTE(review): assumes the selector matches a link whose
          # href holds the next page URL - confirm against real usage.
          link = @response_html.document.at_css(chosen)
          link && link['href']
        end

      build_request_params(next_page_url) if next_page_url.present?
      @has_next_page = next_page_url.present?
    end

    private

    # True when the named callback list is configured and not empty.
    def callbacks_configured?(reader)
      respond_to?(reader) && !public_send(reader).empty?
    end

  end
end
@@ -0,0 +1,21 @@
1
module Creepin

  # Entry point of the definition DSL: evaluates the given block against the
  # instance so that `collection` and `resource` calls inside it build
  # creepers namespaced by `name`.
  class On

    # @param name [String] namespace handed to every Collection/Resource
    # @yield definition block evaluated with instance_eval
    # @raise [ArgumentError] when no block is given
    def initialize(name, &block)
      # instance_eval with a nil block raises ArgumentError with an unrelated
      # "wrong number of arguments" message; fail with a clear one instead.
      raise ArgumentError, 'Creepin::On requires a definition block' unless block

      @config = {}
      @name = name
      instance_eval(&block)
    end

    # Builds a Creepin::Collection for this namespace.
    def collection(*options, &block)
      Creepin::Collection.new(@name, *options, &block)
    end

    # Builds a Creepin::Resource for this namespace.
    def resource(*options, &block)
      Creepin::Resource.new(@name, *options, &block)
    end

  end

end
@@ -0,0 +1,91 @@
1
module Creepin

  # DSL object behind `resource do ... end`.
  #
  # The block is evaluated against the instance to fill @config; a new
  # "<Name>ResourceCreeper" subclass of ResourceCreeper is then registered as
  # a top-level constant, with one reader method per configured setting.
  class Resource

    # @param name [String, Symbol] namespace; camelized to build the class name
    # @param options [Hash] optional :params and :loaded_resource
    # @yield DSL block evaluated with instance_eval
    # @raise [ArgumentError] when no block is given
    def initialize(name, options = {}, &block)
      raise ArgumentError, 'Creepin::Resource requires a configuration block' unless block

      @config = {}
      @request_params = {}

      instance_eval(&block)
      settings = @config.merge(namespace: name)
      # The non-bang merges used previously threw their results away; mutate
      # or reassign so the computed params are actually kept.
      @request_params.merge!(settings[:default_params]) if settings.key?(:default_params)
      # Same precedence as reverse_merge: values already present win.
      @request_params = options[:params].merge(@request_params) if options.key?(:params)
      @loaded_resource = options[:loaded_resource] if options.key?(:loaded_resource)

      # to_s lets Symbol names work as well as Strings.
      class_name = "#{name.to_s.camelize}ResourceCreeper"
      dsl_methods = @config.keys
      creeper_class = Object.const_set(class_name, Class.new(ResourceCreeper))

      # AASM is set up on every generated subclass rather than once on
      # ResourceCreeper; the original author suspected an AASM inheritance bug.
      creeper_class.class_eval do
        include AASM

        aasm do
          state :waiting, :initial => true
          state :crawling
          state :parsing
          state :finished
          state :finished_and_loaded

          event :crawl, :after => :transmit do
            transitions :from => :waiting, :to => :crawling
          end

          event :crawl_finished, :after => :parse_response do
            transitions :from => :crawling, :to => :finished
          end
        end

        # One reader per configured setting; readers close over `settings`.
        # :namespace is in settings but not in @config, so it gets no reader.
        dsl_methods.each do |setting|
          define_method(setting.to_s) { settings[setting] }
        end
      end
    end

    # Stores the URL requested when no url_attribute is configured.
    def base_url(string)
      @config[:base_url] = string
    end

    # Stores query parameters merged into the creeper's request.
    def default_params(hash)
      @config[:default_params] = hash
    end

    # Stores the name of the loaded resource's method that returns its URL.
    def url_attribute(sym)
      @config[:url_attribute] = sym.to_sym
    end

    # Stores the CSS selector used to find the resource's element.
    def selector(string)
      @config[:selector] = string
    end

    # Registers a block that extracts attr_name from the matched element.
    def define_element_mapping(attr_name, &block)
      @config[:element_mappings] ||= {}
      @config[:element_mappings][attr_name] = block
    end

    # Stores the name of the resource class.
    def resource_class(string)
      @config[:resource_class] = string
    end

    # Stores the skip flag; blank values are normalized to false.
    def skip_resource_save(bool)
      @config[:skip_resource_save] = bool.present? ? bool : false
    end

    # Stores the block used to build a resource from collected attributes.
    # Positional options are accepted for backward compatibility and ignored.
    #
    # @raise [ArgumentError] when no block is given
    def resource_load_strategy(*options, &block)
      @config[:resource_load_strategy] = strategy_block(block, :resource_load_strategy)
    end

    # Stores the block used to persist a resource.
    # Positional options are accepted for backward compatibility and ignored.
    #
    # @raise [ArgumentError] when no block is given
    def resource_save_strategy(*options, &block)
      @config[:resource_save_strategy] = strategy_block(block, :resource_save_strategy)
    end

    private

    # Returns the given block, insisting that one was supplied.
    def strategy_block(block, dsl_name)
      raise ArgumentError, "#{dsl_name} requires a block" unless block

      block
    end

  end

end
@@ -0,0 +1,97 @@
1
module Creepin

  # Base class for the "<Name>ResourceCreeper" classes generated by
  # Creepin::Resource.
  #
  # The generated subclass supplies the AASM events called here (crawl,
  # crawl_finished) and one reader per configured setting (base_url,
  # selector, element_mappings, ...). Optional settings are therefore probed
  # with respond_to? before being read.
  class ResourceCreeper

    attr_accessor :collected_attributes_hash, :requested_url, :loaded_resource

    # @param loaded_resource [Object] the object the scraped attributes are
    #   written to
    # @param params [Hash] query parameters for the request
    def initialize(loaded_resource, params = {})
      @params = params.present? ? params : {}
      @loaded_resource = loaded_resource
      @collected_attributes_hash = {}
    end

    def run_after_crawl_callbacks
      transmit
    end

    # Performs the HTTP GET - against the loaded resource's URL when
    # url_attribute is configured, otherwise against base_url with the query
    # params - then triggers crawl_finished.
    def transmit
      if url_attribute?
        url = loaded_resource.send(url_attribute)
        @response = HTTParty.get(url)
        @requested_url = url
      else
        # Non-destructive merge: default_params is the settings hash shared by
        # every instance of the generated class, so merge! leaked one
        # instance's params into all later ones. Also tolerate creepers that
        # configure no default_params at all.
        @request_params ||= { :query => (default_params? ? default_params.merge(@params) : @params) }
        @response = HTTParty.get(base_url, @request_params)
        @requested_url = full_request_url(base_url, @request_params)
      end

      crawl_finished
    end

    def url_attribute?
      respond_to?(:url_attribute)
    end

    def default_params?
      respond_to?(:default_params)
    end

    # Replaces the request params with the query string of the given URL.
    # Leaves them untouched when the parsed query is blank.
    def build_request_params(param_string)
      params_hash = Rack::Utils.parse_query(param_string.split('?').pop)
      @request_params = { :query => params_hash.with_indifferent_access } if params_hash.present?
    end

    # @return [String] base_url followed by "?" and the key=value pairs
    def full_request_url(base_url, request_params)
      query_string = request_params[:query].map { |k, v| "#{k}=#{v}" }.join("&")
      "#{base_url}?#{query_string}"
    end

    def parse_response
      @response_html = Nokogiri::HTML::Document.parse(@response.body)
      load_response_resource
      map_response_resource if response_resource?
    end

    def load_response_resource
      @response_resource = @response_html.document.at_css(selector)
    end

    # Runs every element mapping against the matched element and saves the
    # collected attributes onto the loaded resource.
    def map_response_resource
      element_mappings.each_pair do |attribute, block|
        collected_attributes_hash[attribute] = instance_exec(@response_resource, &block)
      end

      save_resource(collected_attributes_hash, loaded_resource)
    end

    def response_resource?
      @response_resource.present?
    end

    # Builds a resource through the configured load strategy, or with
    # resource_klass.new(attributes) when none is configured.
    def load_resource(collected_attributes_hash, resource_klass)
      if resource_load_strategy?
        resource_load_strategy.call(collected_attributes_hash, resource_klass)
      else
        resource_klass.new(collected_attributes_hash)
      end
    end

    # Persists through the configured save strategy (returning its result),
    # or assigns every collected attribute, saves unless skipped, and returns
    # the resource.
    def save_resource(collected_attributes_hash, resource)
      return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?

      collected_attributes_hash.each_pair { |k, v| resource.send("#{k}=", v) }
      resource.save unless skip_resource_save?
      resource
    end

    # load_resource calls this predicate, but it was never defined here.
    def resource_load_strategy?
      respond_to?(:resource_load_strategy)
    end

    def resource_save_strategy?
      respond_to?(:resource_save_strategy)
    end

    # True only when the setting is configured AND truthy. Checking just
    # respond_to? made `skip_resource_save false` skip the save as well.
    def skip_resource_save?
      (respond_to?(:skip_resource_save) && skip_resource_save) ? true : false
    end

  end

end
@@ -0,0 +1,3 @@
1
module Creepin
  # Gem version, read by creepin.gemspec. Frozen so the constant cannot be
  # mutated in place at runtime.
  VERSION = "0.0.1".freeze
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: creepin
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Jason Ayre
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-01-07 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httparty
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: aasm
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :runtime
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: nokogiri
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :runtime
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Creepin so logically
79
+ email:
80
+ - jasonayre@gmail.com
81
+ executables: []
82
+ extensions: []
83
+ extra_rdoc_files: []
84
+ files:
85
+ - .gitignore
86
+ - Gemfile
87
+ - LICENSE.txt
88
+ - README.md
89
+ - Rakefile
90
+ - creepin.gemspec
91
+ - lib/creepin.rb
92
+ - lib/creepin/collection.rb
93
+ - lib/creepin/collection_creeper.rb
94
+ - lib/creepin/on.rb
95
+ - lib/creepin/resource.rb
96
+ - lib/creepin/resource_creeper.rb
97
+ - lib/creepin/version.rb
98
+ homepage: ''
99
+ licenses: []
100
+ post_install_message:
101
+ rdoc_options: []
102
+ require_paths:
103
+ - lib
104
+ required_ruby_version: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ required_rubygems_version: !ruby/object:Gem::Requirement
111
+ none: false
112
+ requirements:
113
+ - - ! '>='
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 1.8.24
119
+ signing_key:
120
+ specification_version: 3
121
+ summary: Provides structured crawling, and mapping, of external sites, to your ruby
122
+ classes or AR models.
123
+ test_files: []