scrapula 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +7 -0
  2. data/.gitignore +3 -0
  3. data/.rspec +1 -0
  4. data/.simplecov +1 -0
  5. data/CHANGELOG.md +15 -0
  6. data/CONTRIBUTING.md +0 -0
  7. data/Gemfile +24 -0
  8. data/Gemfile.lock +127 -0
  9. data/Guardfile +12 -0
  10. data/LICENSE +21 -0
  11. data/README.md +108 -0
  12. data/ROADMAP.md +42 -0
  13. data/Rakefile +30 -0
  14. data/examples/block_syntax.rb +20 -0
  15. data/examples/find_nodes.rb +6 -0
  16. data/examples/get_first_and_scrape_later.rb +13 -0
  17. data/examples/metas.rb +32 -0
  18. data/examples/more_api.rb +17 -0
  19. data/examples/nested_results.rb +14 -0
  20. data/examples/one_liners.rb +9 -0
  21. data/examples/posting_data.rb +7 -0
  22. data/examples/s.rb +24 -0
  23. data/examples/validation.rb +40 -0
  24. data/lib/scrapula.rb +47 -0
  25. data/lib/scrapula/_old_scraper.rb +110 -0
  26. data/lib/scrapula/agent.rb +8 -0
  27. data/lib/scrapula/data.rb +18 -0
  28. data/lib/scrapula/page.rb +109 -0
  29. data/lib/scrapula/page/meta.rb +74 -0
  30. data/lib/scrapula/request.rb +44 -0
  31. data/lib/scrapula/s.rb +21 -0
  32. data/lib/scrapula/scraper.rb +56 -0
  33. data/lib/scrapula/version.rb +3 -0
  34. data/scrapula.gemspec +36 -0
  35. data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
  36. data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
  37. data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
  38. data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
  39. data/spec/lib/scrapula/agent_spec.rb +6 -0
  40. data/spec/lib/scrapula/data_spec.rb +19 -0
  41. data/spec/lib/scrapula/page/meta_spec.rb +89 -0
  42. data/spec/lib/scrapula/page_spec.rb +136 -0
  43. data/spec/lib/scrapula/request_spec.rb +91 -0
  44. data/spec/lib/scrapula/s_spec.rb +44 -0
  45. data/spec/lib/scrapula/scraper_spec.rb +205 -0
  46. data/spec/lib/scrapula_spec.rb +141 -0
  47. data/spec/spec_helper.rb +26 -0
  48. metadata +118 -0
# data/examples/metas.rb
# API sketch for meta extraction. `metas` is an alias of `meta`.

Scrapula.get('reddit.com').metas
Scrapula.metas('reddit.com')

Scrapula.get('reddit.com').meta[:description]
Scrapula.get('reddit.com').meta.description

Scrapula.meta('reddit.com')[:description]
Scrapula.meta('reddit.com').description

Scrapula.get('github.com').meta.og
Scrapula.meta('github.com').og

Scrapula.get('github.com').metas.twitter
Scrapula.metas('github.com').twitter
Scrapula.metas('github.com')['twitter:site']

# Query?
Scrapula.metas('github.com')[/twit/]

# Custom ~"namespaces"
Scrapula.metas('page.com').ns

# [] and each methods ?
Scrapula.metas('page.com').each do |meta|
end

# TODO to_a ?
processed_metas = Scrapula.metas('page.com').to_a.map do |meta|
  meta.upcase
end
# data/examples/more_api.rb
# Builds a request explicitly, then shows the Page#scrape shorthand.
require_relative '../lib/scrapula'

request = Scrapula::Request.new url: 'http://gitlab.com', method: :get
page = request.execute

page_content = Scrapula::Scraper.new page do
  lol '#lol'
  more '#more'
  examples '.examples'
end

# This is the equivalent to the previous code
page_content = page.scrape do
  lol '#lol'
  more '#more'
  examples '.examples'
end
# data/examples/nested_results.rb
# Nested scraping DSL: inner blocks describe sub-structures of each match.
require_relative '../lib/scrapula'

posts = Scrapula.get 'reddit.com' do
  # TODO relative
  posts '.posts' do
    title '.title'

    comments '.comment' do
      author '.author'
    end
  end
end

puts posts
# data/examples/one_liners.rb
require_relative '../lib/scrapula'

lol_node = Scrapula.get('lol.com').at!('#lol')

# It is possible to use the methods without "!"; in block syntax it's not
lol_node = Scrapula.get('lol.com').at('#lol')
lol_results = Scrapula.get('lol.com') do
  at! '#lol'
end
# data/examples/posting_data.rb
# NOTE(review): Scrapula::HTTP_METHODS only defines `get` in this version,
# so `Scrapula.post` does not exist yet — this example is aspirational (TODO).
require_relative '../lib/scrapula'

# TODO
page = Scrapula.post 'duckduckgo.com', {}
h1 = page.at! 'h1'

puts h1
# data/examples/s.rb
# Terse `S` shortcut API.
require_relative '../lib/scrapula/s'


lol_node = S('lol.com').at!('#lol')

lol_content = S('lol.com').text!('#lol')

array_of_content = S('lol.com').text!({ lol: '#lol', xd: '#xD' })

S 'http://google.com' do |page|
  # result page.('example')
end

S 'http://google.com' do
  # result 'example'
end

S.get 'http://google.com' do
  # result 'example'
end

S.post 'http://google.com' do
  # result 'example'
end
@@ -0,0 +1,40 @@
1
+ require_relative '../lib/scrapula'
2
+
3
+ Scrapula.get "http://example.com/?q=#{params}" do
4
+ number css: '#container ul > active.li' do |node, resource|
5
+ node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
6
+ end
7
+ end
8
+
9
+ Scrapula.get "http://example.com/?q=#{params}" do
10
+ number css: '#container ul > active.li' do |node, resource|
11
+ node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
12
+ end
13
+
14
+ number = css '#container ul > active.li' do |node, resource|
15
+ node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
16
+ end
17
+
18
+ number = css '#container .number', [:to_s, :strip, :to_i]
19
+
20
+ number css! '#container .number', [:to_s, :strip, :to_i]
21
+ number xpath! '//a', [:to_s, :strip, :to_i]
22
+
23
+ number { css: '#container .number' }, [:to_s, :strip, :to_i]
24
+ number { xpath: '//a' }, [:to_s, :strip, :to_i]
25
+
26
+ # Infer query type
27
+ number '#container .number', [:to_s, :strip, :to_i]
28
+
29
+ number do
30
+ validates!({
31
+ presence: true,
32
+ numerically: true,
33
+ })
34
+
35
+ rescue! do |error|
36
+ puts 'Error on number'
37
+ end
38
+ end
39
+
40
+ end
# data/lib/scrapula.rb
require_relative './scrapula/version'
require_relative './scrapula/agent'
require_relative './scrapula/scraper'
require_relative './scrapula/request'
require_relative './scrapula/page'
require_relative './scrapula/page/meta'
require_relative './scrapula/data'

# Entry point of the gem: defines one module-level method per supported
# HTTP verb (currently only +get+), plus meta-extraction shortcuts.
module Scrapula

  # TODO benchmark
  # AGENTS = %w[mechanize]

  # TODO support more verbs
  HTTP_METHODS = %w[get]

  # # TODO verbose, logger
  # @verbose, @logger, @agent = false, nil, 'Mechanize'

  class << self

    # attr_accessor :verbose, :logger, :default_agent

    # Scrapula.get(url, params = nil) or Scrapula.get(url: ..., params: ...).
    # With a block, the fetched page is scraped immediately and the scraped
    # data is returned; otherwise the Page itself is returned.
    HTTP_METHODS.each do |http_method|
      define_method http_method do |*args, &block|

        # Prepare the request data: either a ready-made hash, or url + params
        request_data =
          if args.first.is_a? Hash
            args.first
          else
            { url: args.first, params: args[1] }
          end
        request_data.merge! method: http_method

        page = Request.new(request_data).execute

        block ? page.scrape(&block) : page
      end
    end

    # Fetches +url+ and returns its meta information (Page::Meta).
    def meta url
      get(url).meta!
    end
    alias metas meta

    # def configure &block
    # end

  end

end
# data/lib/scrapula/_old_scraper.rb
module Scrapula
  # Legacy scraper. Bare method calls inside the DSL block are caught by
  # #method_missing and stored into @data via Page#txt!.
  class Scraper

    # Binds the scraper to +page+ and, when a block is given, evaluates it
    # in the scraper's context so the DSL records values into @data.
    #
    # FIX: the original only evaluated the bare expression `@data` when a
    # block was given — a no-op — so the block was never executed and
    # #data! always returned {}. The block is now run with instance_exec.
    def initialize page, &block
      @page, @data = page, {}

      instance_exec(&block) if block
    end

    # delegate @page

    # Yields the underlying page when a block is given; returns self.
    def execute &block
      yield @page if block_given?

      self
    end

    # The data collected by the DSL block.
    def data!
      @data
    end

    private

    # DSL catch-all: `title '#selector'` stores @page.txt!('#selector')
    # under the key :title.
    def method_missing name, *args, &block
      @data[name] = @page.txt!(*args)
    end

    # Paired with the catch-all above: this scraper accepts any message.
    def respond_to_missing? name, include_private = false
      true
    end

    # # Returns the title of the page
    # def title; go('head > title').text end
    #
    # # Returns the entries of the Meta headers
    # def meta
    #   results = {}
    #
    #   go 'head > meta' do |metas|
    #     metas.each do |meta|
    #       name = meta.attributes.first[0]
    #
    #       if 'charset' == name
    #         value = meta[name]
    #       else
    #         name = meta[name]
    #         value = meta.attributes['content'].value
    #
    #         if Scrapula.verbose
    #           puts "Unknown meta: #{name}" unless ['http-equiv', 'name', 'property'].include? name
    #         end
    #       end
    #
    #       results[name] = value
    #
    #       # block.call(name, value, meta) if block_given?
    #     end if metas
    #   end
    #
    #   results
    # end
    #
    # alias metas meta
    #
    # # Extract text and href attribute from an anchor
    # # If you need more attributes, you can use a block
    # def anchor query, &block
    #   go query do |node|
    #     block_given? ? block.call(node) : [node.text, node.attr('href').text]
    #   end
    # end
    #
    # alias link anchor
    #
    # # Extracts an integer
    # # The default sanitization can admit integers like 4,123 or 345.678.234 FIXME
    # # If you need a complex one, you can use a block
    # def int query, &block
    #   go query do |node|
    #     block_given? ? block.call(node.text) : node.text.slice(/\d+/).to_i
    #   end
    # end
    #
    # # TODO Extracts the data of a table and returns it as an array or hash
    # def table query, &block
    #
    # end
    #
    # # Get nodes (Nokogiri::XML::Node)
    # # It yield the node and an empty hash to the block TODO remove results...
    # def nodes query, &block
    #   results = []
    #
    #   go query do |nodes|
    #     nodes.each do |node|
    #       result = {}
    #       results << result if yield node, result
    #     end if nodes
    #   end
    #
    #   results
    # end
    #
    # private
    #
    # # Search the page with XPath / CSS query
    # def go query, &block
    #   page {|p| block_given? ? block.call(p.search query) : p.search(query) }
    # end

  end
end
# data/lib/scrapula/agent.rb
require 'mechanize'

module Scrapula

  # Thin wrapper over Mechanize; exists so the rest of the gem depends on
  # a single Scrapula-owned agent class rather than on Mechanize directly.
  class Agent < Mechanize
  end

end
# data/lib/scrapula/data.rb
module Scrapula
  # Result container. Inherits from BasicObject so nearly any key can be
  # exposed as a method without clashing with Object's namespace.
  class Data < BasicObject

    def initialize
      @data = {}

      # undef_method :execute
    end

    # NOTE(review): yields @page, which is never assigned anywhere in this
    # class — the block always receives nil. Possibly meant to yield @data;
    # confirm against callers before changing.
    def execute &block

      yield @page

      # block.call
    end

  end
end
# data/lib/scrapula/page.rb
module Scrapula
  # Wraps the page object returned by the agent and exposes bang-suffixed
  # extraction helpers (at!, search!, text!, html!, ...).
  class Page

    # +agent_page+ is the underlying agent page. Duck-typed: it must
    # respond to #at and #search (as Mechanize/Nokogiri pages do).
    def initialize agent_page
      @agent_page = agent_page
    end

    # TODO
    # def url!
    #   @agent_page.url
    # end

    # Runs the scraping DSL block against this page; returns the data.
    def scrape &block
      scraper = Scrapula::Scraper.new self, &block
      scraper.data!
    end

    # Meta information of the page (memoized).
    def meta!
      @meta ||= Meta.new self
    end
    alias metas! meta!

    # Returns every node matching +query+ (at! returns the first one only).
    # Each symbol in +operations+ is sent to the result, in order; the
    # final result is yielded when a block is given.
    # FIXME operations are applied to the node set, not to every node in it
    def search! query, operations = [], &block
      result = @agent_page.search query
      result = apply_operations result, operations

      yield result if block_given?

      result
    end

    # First node matching +query+ (search! returns all of them), run
    # through +operations+ exactly like #search!.
    def at! query, operations = [], &block
      result = @agent_page.at query
      result = apply_operations result, operations

      yield result if block_given?

      result
    end

    # TODO
    # def attribute! query, name, operations = [], &block
    # end

    # First matching node serialized to HTML.
    # FIX: builds a fresh operations array instead of unshifting into the
    # caller's — the original mutated the argument, so repeated calls with
    # a shared array accumulated :to_html entries.
    def html! query, operations = [], &block
      at! query, [:to_html, *operations], &block
    end

    # Text content of the first matching node (same mutation fix as #html!).
    def text! query, operations = [], &block
      at! query, [:text, *operations], &block
    end
    alias txt! text!

    # TODO xhtml! (to_xhtml) ?
    # TODO inner_html / inner_text ?

    # The methods below are unimplemented placeholders for the planned API.
    def regex! query, re, operations = [], &block
    end
    alias re! regex!

    def int! query, re, operations = [], &block
    end

    def decimal! query, operations = [], &block
    end

    def datetime! query, format, operations = [], &block
    end

    def date! query, format, operations = [], &block
    end

    def time! query, format, operations = [], &block
    end

    # def css! query, operations = [], &block
    #   @agent_page.css query
    # end

    # def xpath! query, operations = [], &block
    # end

    # TODO
    # def validates!
    # end

    # TODO?
    # def rescue! error, &block
    # end

    # TODO title, meta/s, anchors/links, table

    private

    # Sends each operation, in order, to +result+; skipped entirely when
    # the query matched nothing (result is nil/false).
    def apply_operations result, operations
      return result unless result
      operations.reduce(result) {|tmp, op| tmp.__send__ op }
    end

  end

  # Class.new {
  #   def regex node_set, regex
  #     node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
  #   end
  # }.new

end
+ end