klepto 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. data/.gitignore +21 -0
  2. data/.rspec +2 -0
  3. data/.rvmrc +1 -0
  4. data/Gemfile +18 -0
  5. data/Guardfile +11 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +129 -0
  8. data/Rakefile +7 -0
  9. data/klepto.gemspec +26 -0
  10. data/lib/klepto.rb +26 -0
  11. data/lib/klepto/bot.rb +59 -0
  12. data/lib/klepto/browser.rb +18 -0
  13. data/lib/klepto/crawler.rb +72 -0
  14. data/lib/klepto/tasks.rb +15 -0
  15. data/lib/klepto/version.rb +3 -0
  16. data/samples/example.rb +49 -0
  17. data/spec/cassettes/Klepto_Crawler/dsl_interaction/should_crawl_the_resource.yml +1960 -0
  18. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_set_of_nodes.yml +114 -0
  19. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_block.yml +114 -0
  20. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_designate_scraping_of_a_single_node_with_a_symbol.yml +114 -0
  21. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_be_able_to_scrape_the_node_that_the_crawler_is_scoped_to.yml +114 -0
  22. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_CSS_scope.yml +114 -0
  23. data/spec/cassettes/Klepto_Crawler/standard_interaction/should_have_a_desired_syntax.yml +114 -0
  24. data/spec/lib/klepto/bot_spec.rb +40 -0
  25. data/spec/lib/klepto/browser_spec.rb +15 -0
  26. data/spec/lib/klepto/crawler_spec.rb +88 -0
  27. data/spec/lib/klepto/dsl_spec.rb +6 -0
  28. data/spec/lib/klepto_spec.rb +64 -0
  29. data/spec/orm/active_record.rb +36 -0
  30. data/spec/orm/database.example.yml +15 -0
  31. data/spec/spec_helper.rb +32 -0
  32. metadata +157 -0
@@ -0,0 +1,21 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ log
14
+ rdoc
15
+ spec/reports
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ spec/orm/database.yml
20
+ .DS_Store
21
+ **/.DS_Store
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use ruby-1.9.3-p194@klepto
data/Gemfile ADDED
@@ -0,0 +1,18 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in klepto.gemspec
gemspec

# Development and test tooling. Each gem is declared exactly once: Bundler
# warns when the same gem is listed more than once in a Gemfile.
gem 'rspec', '2.12.0'
gem 'rb-fsevent'
gem "guard", '1.6.1'
gem "guard-bundler"
gem "guard-rspec"
gem 'debugger', '1.2.0'
gem 'vcr'
gem 'fakeweb'
#gem 'activerecord', "~>3.2.0"
gem 'activerecord'
gem 'mysql2'
gem 'ruby_gntp'
gem 'simplecov', :require => false
@@ -0,0 +1,11 @@
1
+ guard 'bundler' do
2
+ watch('Gemfile')
3
+ # Uncomment the next line if the Gemfile contains the `gemspec' command
4
+ # watch(/^.+\.gemspec/)
5
+ end
6
+
7
+ guard 'rspec', :version => 2 do
8
+ watch(%r{^spec/.+_spec\.rb$})
9
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
10
+ watch('spec/spec_helper.rb') { "spec" }
11
+ end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Cory O'Daniel
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,129 @@
1
+ # Klepto
2
+
3
+ A mean little DSL'd capybara (poltergeist) based web crawler that stuffs data into your Rails app.
4
+
5
+ ## Features
6
+
7
+ * CSS or XPath Syntax
8
+ * Full javascript processing via phantomjs / poltergeist
9
+ * All the fun of capybara
10
+ * Scrape multiple pages with a single bot
11
+ * Scrape individual pages with multiple 'crawlers' (see the Bieber example below).
12
+ * Test coverage!
13
+
14
+ ## Usage
15
+ Say you want a bunch of Bieb tweets! How is there not profit in that?
16
+
17
+ ```ruby
18
+ # Make a bot
19
+ @bot = Klepto::Bot.new do
20
+ # Set your selector syntax. You can change to :xpath if you are 40+ or love C#.
21
+ syntax :css
22
+
23
+ # Send some headers, confuse everyone.
24
+ headers 'Referer' => 'http://www.twitter.com'
25
+
26
+ # The more the merrier. It takes a *splat.
27
+ urls 'https://twitter.com/justinbieber'
28
+
29
+ # Crawl the body of the page to get the user info
30
+ crawl 'body' do
31
+ # The default handler is to call .text on the scraped node.
32
+ scrape "h1.fullname", :name
33
+
34
+ # Scrape finds the first matching element for the given selector within
35
+ # the scope above (here: 'body')
36
+ scrape '.username span.screen-name', :username
37
+
38
+ # Scrape all matching elements with #scrape_all
39
+ scrape_all 'span.url a' do |nodes|
40
+ {
41
+ links: nodes.map{|n| n[:href]}
42
+ }
43
+ end
44
+
45
+ # Each 'match' of the crawler's selector (here: 'body') will have the
46
+ # content from 'scrape' passed in as a hash
47
+ save do |params|
48
+ user = User.find_by_name(params[:username]) || User.new
49
+ user.update_attributes params
50
+ end
51
+ end
52
+
53
+ # Get dem tweets
54
+ crawl 'li.stream-item' do
55
+ # Passing no parameters to scrape will set the context to be the
56
+ # outer matched crawled element (here: 'li.stream-item')
57
+ scrape do |node|
58
+ {:twitter_id => node['data-item-id']}
59
+ end
60
+
61
+ # Put '.content p' into params[:content]
62
+ scrape '.content p', :content
63
+
64
+ # Pass a block for more control
65
+ scrape '._timestamp' do |node|
66
+ {timestamp: node['data-time']}
67
+ end
68
+
69
+ scrape '.time a' do |node|
70
+ {permalink: node[:href]}
71
+ end
72
+
73
+ # Each 'match' of the crawler's selector (here: 'li.stream-item') will have the
74
+ # content from 'scrape' passed in as a hash
75
+ save do |params|
76
+ tweet = Tweet.find_by_twitter_id(params[:twitter_id]) || Tweet.new
77
+ tweet.update_attributes params
78
+ end
79
+ end
80
+ end
81
+
82
+ #start that bot for some sweet victory, heart throb.
83
+ @bot.start!
84
+ ```
85
+
86
+ All your content are belong to us.
87
+
88
+
89
+
90
+ ## Stuff I'm going to add.
91
+
92
+ event handlers...
93
+ --------------------
94
+
95
+ on_http_status(500,404) do |response, bot|
96
+ email('admin@example.com', bot.status, bot.summary)
97
+ end
98
+ on_assertion_failure{ |response, bot| }
99
+ on_invalid_resource{ |resource, bot| }
100
+
101
+ Pre-req Steps
102
+ --------------------
103
+
104
+ prepare [
105
+ [:GET, 'http://example.com'],
106
+ [:POST, 'http://example.com/login', {username: 'cory', password: '123456'}],
107
+ ]
108
+
109
+ Page Assertions
110
+ --------------------
111
+
112
+ assertions do
113
+ present 'li.offer'
114
+ present 'h3 a', :present => [:href]
115
+ within 'li.offer' do
116
+ present 'h3'
117
+ end
118
+
119
+ scrape 'h3 a' do |node|
120
+ node.is_a_link_to_someplace_we_like
121
+ end
122
+ end
123
+
124
+ Cookie Stuffing
125
+ -------------------
126
+
127
+ cookies({
128
+ 'Has Fun' => true
129
+ })
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new('spec')
5
+
6
+ # If you want to make this the default task
7
+ task :default => :spec
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'klepto/version'
5
+
6
+ Gem::Specification.new do |gem|
7
+ gem.name = "klepto"
8
+ gem.version = Klepto::VERSION
9
+ gem.authors = ["Cory O'Daniel"]
10
+ gem.email = ["github@coryodaniel.com"]
11
+ gem.description = "Tearing up web pages into ActiveRecord resources"
12
+ gem.summary = "Tearing up web pages into ActiveRecord resources"
13
+ gem.homepage = "http://github.com/coryodaniel/klepto"
14
+
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
+ gem.require_paths = ["lib"]
19
+ gem.add_dependency "docile"
20
+ gem.add_dependency "poltergeist", '1.1.0'
21
+ gem.add_dependency "capybara", '2.0.2'
22
+ #gem.add_dependency "thor"
23
+ gem.add_dependency "nokogiri", '~> 1.5.6'
24
+ gem.add_dependency "activesupport"
25
+ gem.add_dependency 'multi_json', '~> 1.0'
26
+ end
@@ -0,0 +1,26 @@
1
+ require 'docile'
2
+ require 'open-uri'
3
+ require 'logger'
4
+ require "capybara"
5
+ require "capybara/dsl"
6
+ require 'capybara/poltergeist'
7
+ require 'pp'
8
+
9
+ Capybara.run_server = false
10
+
11
+ Capybara.register_driver :poltergeist do |app|
12
+ Capybara::Poltergeist::Driver.new(app, {
13
+ js_errors: false
14
+ })
15
+ end
16
+ Capybara.current_driver = :poltergeist
17
+
18
+ module Klepto
19
+ LOG = Logger.new(STDOUT)
20
+ LOG.level = Logger::WARN
21
+ end
22
+
23
+ require 'klepto/version'
24
+ require 'klepto/crawler'
25
+ require 'klepto/browser'
26
+ require 'klepto/bot'
@@ -0,0 +1,59 @@
1
module Klepto
  # Coordinates a crawl: keeps the list of URLs to visit, the crawlers to run
  # against every fetched page, and the browser used to fetch those pages.
  class Bot
    attr_reader :browser, :crawlers

    # Sets the defaults (CSS selector syntax, not a dry run, no URLs, no
    # crawlers), creates a Klepto::Browser and, when a block is given,
    # evaluates it against this bot via Docile so the DSL methods below are
    # called on this instance.
    #
    # Positional arguments are accepted but not used.
    def initialize(*args, &block)
      @syntax     = :css
      @is_dry_run = false
      @urls       = []
      @crawlers   = []
      @browser    = Klepto::Browser.new
      Docile.dsl_eval(self, &block) if block_given?
    end

    # Switches the bot to dry-run mode: #start! prints the scraped resources
    # instead of persisting them.
    def dry_run!
      @is_dry_run = true
    end

    # @return [Boolean] whether #dry_run! has been called
    def dry_run?
      !!@is_dry_run
    end

    # Reads or sets the selector syntax handed to every crawler.
    #
    # @param kind [Symbol, nil] new syntax (e.g. :css or :xpath); nil leaves
    #   the current value untouched
    # @return [Symbol] the syntax in effect
    def syntax(kind=nil)
      @syntax = kind unless kind.nil?
      @syntax
    end

    # Forwards request headers to the browser.
    #
    # @param _headers [Hash] header name => value
    def headers(_headers)
      @browser.set_headers(_headers)
    end

    # Adds one or more URLs to visit. Accepts a splat of URLs as well as
    # Arrays of URLs; nested Arrays are flattened so that every entry handed
    # to the browser is a single URL.
    #
    # @return [Array] all URLs registered so far
    def url(*args)
      @urls += args.flatten
    end
    alias :urls :url

    # Registers a crawler for +scope+. The bot's current syntax is always
    # passed to the crawler as options[:syntax]. The caller's options hash is
    # copied rather than modified in place.
    #
    # @param scope [String] selector the crawler is scoped to
    # @param options [Hash] extra options forwarded to Klepto::Crawler.new
    # @return [Array] the list of registered crawlers
    def crawl(scope, options={}, &block)
      crawler_options = options.merge(:syntax => @syntax)
      @crawlers << Klepto::Crawler.new(scope, crawler_options, &block)
    end

    # Fetches every registered URL and runs every crawler on each fetched
    # page. Once all pages are crawled, each crawler's resources are either
    # pretty-printed (dry run) or persisted.
    def start!
      @urls.each do |url|
        browser.fetch!(url)
        @crawlers.each do |crawler|
          crawler.crawl browser.page
        end
      end

      @crawlers.each do |crawler|
        if dry_run?
          pp crawler.resources
        else
          crawler.persist!
        end
      end
    end

  end
end
@@ -0,0 +1,18 @@
1
module Klepto
  # Small wrapper around the Capybara DSL used by the bot to load pages.
  class Browser
    include Capybara::DSL

    # Accepts any arguments and forwards them unchanged to the superclass
    # initializer.
    def initialize(*args)
      super(*args)
    end

    # Assigns the given headers to the current page's driver.
    #
    # @param headers [Hash] header name => value
    def set_headers(headers)
      page.driver.headers = headers
    end

    # Visits +url+ and returns the page object afterwards.
    def fetch!(url)
      visit(url)
      page
    end
  end
end
@@ -0,0 +1,72 @@
1
+ require 'docile'
2
module Klepto
  # Scrapes structured data out of a page. A crawler is scoped to a selector;
  # for every element of the page matching that scope it applies the
  # registered "designations" (see #scrape / #scrape_all) and collects the
  # results into one params Hash per matched element.
  class Crawler
    attr_accessor :resources
    attr_reader :scope, :syntax

    # @param scope [String] selector identifying the elements to crawl
    # @param options [Hash] :syntax is the selector syntax passed to the
    #   page's finders; :limit and :skip are stored but not used by any code
    #   in this class
    # @yield optional DSL block, evaluated against this crawler via Docile
    def initialize(scope,options={},&block)
      @resources        = []
      @limit            = options[:limit]
      @skip             = options[:skip]
      @syntax           = options[:syntax]
      @scope            = scope
      @designations     = []
      @resource_handler = nil

      Docile.dsl_eval(self, &block) if block_given?
    end

    # Designates the first element matching +selector+ inside each crawled
    # element. With a nil selector the crawled element itself is used.
    #
    # Exactly one of +assignee+ or a block must be given:
    # * assignee - the element's text is stored under params[assignee]
    # * block    - called with the element (nil when the finder returned
    #              nil); its Hash result is merged into params
    #
    # @raise [ArgumentError] when neither or both of assignee/block are given
    def scrape(selector=nil, assignee=nil, &block)
      check_designation!(assignee, block_given?)
      designate(:first, selector, assignee, &block)
    end

    # Designates all elements matching +selector+ inside each crawled element.
    #
    # Exactly one of +assignee+ or a block must be given:
    # * assignee - an Array with the text of every matched element is stored
    #              under params[assignee]
    # * block    - called with the collection of matched elements; its Hash
    #              result is merged into params
    #
    # @raise [ArgumentError] when neither or both of assignee/block are given
    def scrape_all(selector, assignee=nil, &block)
      check_designation!(assignee, block_given?)
      designate(:all, selector, assignee, &block)
    end

    # Registers the block that #persist! calls with each collected resource.
    def save(&block)
      @resource_handler = block
    end

    # Applies every designation to every element of +page+ that matches the
    # crawler's scope and appends one params Hash per element to #resources.
    #
    # @param page an object responding to all(syntax, selector); the elements
    #   it yields must respond to first/all with the same arguments and to text
    # @return [Array<Hash>] all resources collected so far, including those
    #   from earlier calls
    def crawl(page)
      page.all(syntax, scope).each do |selection|
        params = {}
        @designations.each do |first_or_all, selector, assignee, handler|
          # A nil selector always targets the crawled element itself.
          single = selector.nil? || first_or_all == :first
          target = locate(selection, selector, single)

          if assignee
            params[assignee] = text_of(target, single)
          else
            params.merge!(handler.call(target))
          end
        end
        @resources << params
      end

      @resources
    end

    # Hands every collected resource to the block registered with #save.
    # Does nothing when no block was registered.
    def persist!
      if @resource_handler
        @resources.each {|resource| @resource_handler.call(resource)}
      end
    end

    protected
    # Records a designation as [count, selector, assignee, handler].
    def designate(count, selector, assignee, &block)
      @designations << [count, selector, assignee, block]
    end

    private

    # Enforces that a designation names exactly one way of producing params.
    def check_designation!(assignee, has_block)
      if assignee.nil? && !has_block
        raise ArgumentError, "an assignee or a block is required"
      elsif !assignee.nil? && has_block
        raise ArgumentError, "pass either an assignee or a block, not both"
      end
    end

    # Finds what a designation points at within +selection+.
    def locate(selection, selector, single)
      return selection if selector.nil?
      single ? selection.first(syntax, selector) : selection.all(syntax, selector)
    end

    # Text for an assignee: the node's text (nil for a missing node), or an
    # Array of texts for a collection. Avoids ActiveSupport's Object#try.
    def text_of(target, single)
      if single
        target.nil? ? nil : target.text
      else
        target.map { |node| node.text }
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
+ # require this file to load the tasks
2
+ require 'rake'
3
+
4
+ # noop
5
+ =begin
6
+ This is here as a start point for adding rake tasks that can be 'required' by another project
7
+ Just add: require 'klepto/tasks' to your Rakefile
8
+ =end
9
+
10
+ namespace :klepto do
11
+ desc "Example task"
12
+ task :example do
13
+ puts "I'm a task"
14
+ end
15
+ end