scrapula 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.rspec +1 -0
- data/.simplecov +1 -0
- data/CHANGELOG.md +15 -0
- data/CONTRIBUTING.md +0 -0
- data/Gemfile +24 -0
- data/Gemfile.lock +127 -0
- data/Guardfile +12 -0
- data/LICENSE +21 -0
- data/README.md +108 -0
- data/ROADMAP.md +42 -0
- data/Rakefile +30 -0
- data/examples/block_syntax.rb +20 -0
- data/examples/find_nodes.rb +6 -0
- data/examples/get_first_and_scrape_later.rb +13 -0
- data/examples/metas.rb +32 -0
- data/examples/more_api.rb +17 -0
- data/examples/nested_results.rb +14 -0
- data/examples/one_liners.rb +9 -0
- data/examples/posting_data.rb +7 -0
- data/examples/s.rb +24 -0
- data/examples/validation.rb +40 -0
- data/lib/scrapula.rb +47 -0
- data/lib/scrapula/_old_scraper.rb +110 -0
- data/lib/scrapula/agent.rb +8 -0
- data/lib/scrapula/data.rb +18 -0
- data/lib/scrapula/page.rb +109 -0
- data/lib/scrapula/page/meta.rb +74 -0
- data/lib/scrapula/request.rb +44 -0
- data/lib/scrapula/s.rb +21 -0
- data/lib/scrapula/scraper.rb +56 -0
- data/lib/scrapula/version.rb +3 -0
- data/scrapula.gemspec +36 -0
- data/spec/cassettes/Scrapula_Page_Meta/_.yml +748 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/Open_Graph.yml +322 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/other_names.yml +586 -0
- data/spec/cassettes/Scrapula_Page_Meta/_/standard_names.yml +429 -0
- data/spec/lib/scrapula/agent_spec.rb +6 -0
- data/spec/lib/scrapula/data_spec.rb +19 -0
- data/spec/lib/scrapula/page/meta_spec.rb +89 -0
- data/spec/lib/scrapula/page_spec.rb +136 -0
- data/spec/lib/scrapula/request_spec.rb +91 -0
- data/spec/lib/scrapula/s_spec.rb +44 -0
- data/spec/lib/scrapula/scraper_spec.rb +205 -0
- data/spec/lib/scrapula_spec.rb +141 -0
- data/spec/spec_helper.rb +26 -0
- metadata +118 -0
# data/examples/metas.rb
# Examples of reading <meta> headers from a page.
# `metas` == `meta` (aliased in Scrapula and on Page)

Scrapula.get('reddit.com').metas
Scrapula.metas('reddit.com')

# Individual entries can be read with [] or as a method call
Scrapula.get('reddit.com').meta[:description]
Scrapula.get('reddit.com').meta.description

Scrapula.meta('reddit.com')[:description]
Scrapula.meta('reddit.com').description

# "Namespaced" metas, e.g. Open Graph (og:*) and Twitter cards
Scrapula.get('github.com').meta.og
Scrapula.meta('github.com').og

Scrapula.get('github.com').metas.twitter
Scrapula.metas('github.com').twitter
Scrapula.metas('github.com')['twitter:site']

# Query?
# NOTE(review): regexp lookup — presumably matches meta names; confirm Meta#[] supports Regexp
Scrapula.metas('github.com')[/twit/]

# Custom ~"namespaces"
Scrapula.metas('page.com').ns

# [] and each methods ?
Scrapula.metas('page.com').each do |meta|
end

# TODO to_a ?
processed_metas = Scrapula.metas('page.com').to_a.map do |meta|
  meta.upcase
end
# data/examples/more_api.rb
# Lower-level API: build a Request by hand, execute it to get a Page,
# then scrape the Page with the block DSL.
require_relative '../lib/scrapula'

request = Scrapula::Request.new url: 'http://gitlab.com', method: :get
page = request.execute

# NOTE(review): Scraper.new returns the Scraper instance, not the scraped data —
# confirm this example is equivalent to page.scrape below (which returns data!)
page_content = Scrapula::Scraper.new page do
  lol '#lol'
  more '#more'
  examples '.examples'
end

# This is the equivalent to the previous code
page_content = page.scrape do
  lol '#lol'
  more '#more'
  examples '.examples'
end
# data/examples/one_liners.rb
# One-line lookups against a fetched page.
require_relative '../lib/scrapula'

lol_node = Scrapula.get('lol.com').at!('#lol')

# It is possible to use the methods without "!"; in block syntax it's not
lol_node = Scrapula.get('lol.com').at('#lol')
lol_results = Scrapula.get('lol.com') do
  at! '#lol'
end
# data/examples/s.rb
# The shorthand S() entry point (see lib/scrapula/s.rb).
require_relative '../lib/scrapula/s'


# S(url) fetches a page; bang methods query it
lol_node = S('lol.com').at!('#lol')

lol_content = S('lol.com').text!('#lol')

# A Hash of queries extracts several pieces of content at once
array_of_content = S('lol.com').text!({ lol: '#lol', xd: '#xD' })

# Block form with an explicit page argument
S 'http://google.com' do |page|
  # result page.('example')
end

# Block form evaluated in the scraping DSL context
S 'http://google.com' do
  # result 'example'
end

# Explicit HTTP verbs
S.get 'http://google.com' do
  # result 'example'
end

S.post 'http://google.com' do
  # result 'example'
end
# data/examples/validation.rb
# NOTE(review): this file is aspirational DSL pseudocode, not runnable Ruby —
# e.g. `number { css: '...' }, [...]` is a syntax error (hash label inside a
# block, trailing argument list after a block). It sketches a planned
# validation / rescue API; do not require it.
require_relative '../lib/scrapula'

Scrapula.get "http://example.com/?q=#{params}" do
  number css: '#container ul > active.li' do |node, resource|
    node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
  end
end

Scrapula.get "http://example.com/?q=#{params}" do
  number css: '#container ul > active.li' do |node, resource|
    node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
  end

  number = css '#container ul > active.li' do |node, resource|
    node[/([\d,]+)\s*results?/i, 1].gsub(',', '').to_i
  end

  number = css '#container .number', [:to_s, :strip, :to_i]

  number css! '#container .number', [:to_s, :strip, :to_i]
  number xpath! '//a', [:to_s, :strip, :to_i]

  number { css: '#container .number' }, [:to_s, :strip, :to_i]
  number { xpath: '//a' }, [:to_s, :strip, :to_i]

  # Infer query type
  number '#container .number', [:to_s, :strip, :to_i]

  number do
    validates!({
      presence: true,
      numerically: true,
    })

    rescue! do |error|
      puts 'Error on number'
    end
  end

end
# data/lib/scrapula.rb
# Top-level entry points for the gem. For each supported HTTP verb a module
# method is metaprogrammed (Scrapula.get ...) that builds a Request, executes
# it into a Page, and either returns the Page or scrapes it with the block.
require_relative './scrapula/version'
require_relative './scrapula/agent'
require_relative './scrapula/scraper'
require_relative './scrapula/request'
require_relative './scrapula/page'
require_relative './scrapula/page/meta'
require_relative './scrapula/data'

module Scrapula

  # TODO benchmark
  # AGENTS = %w[mechanize]

  # TODO
  # Verbs that get an entry-point method defined below; only GET for now.
  HTTP_METHODS = %w[get]

  # # TODO verbose, logger
  # @verbose, @logger, @agent = false, nil, 'Mechanize'

  class << self

    # attr_accessor :verbose, :logger, :default_agent

    # Defines Scrapula.get(url, params = nil, &block) / Scrapula.get(hash, &block)
    HTTP_METHODS.each do |http_method|
      define_method http_method do |*args, &block|

        # Prepare the request data
        # Either a ready-made options Hash, or (url, params) positionals.
        data = args[0].is_a?(Hash) ? args[0] : { url: args[0], params: args[1] }
        # NOTE(review): http_method is a String here ("get"), while the examples
        # pass method: :get — confirm Request normalizes String/Symbol methods.
        data.merge! method: http_method

        page = Request.new(data).execute

        # With a block, scrape immediately and return the data; otherwise
        # return the Page for later use.
        block ? page.scrape(&block) : page
      end
    end

    # Fetches url and returns its meta headers (Page::Meta).
    def meta url
      get(url).meta!
    end
    alias metas meta

    # def configure &block
    # end

  end

end
# data/lib/scrapula/_old_scraper.rb
# Legacy Scraper kept for reference; the active implementation lives in
# lib/scrapula/scraper.rb (this file is not required by lib/scrapula.rb).
module Scrapula
  class Scraper

    # Wraps a Page and accumulates scraped values into @data.
    # NOTE(review): the &block parameter is accepted but never invoked —
    # `@data` inside the block_given? branch is a no-op expression.
    def initialize page, &block
      @page, @data = page, {}

      if block_given?
        @data
      end
    end

    # delegate @page

    # Yields the wrapped Page to the block (if any) and returns self.
    def execute &block
      yield @page if block_given?

      self
    end

    # Returns the accumulated scrape results Hash.
    def data!
      @data
    end

    private

    # DSL catch-all: any unknown method name becomes a result key whose value
    # is Page#txt! applied to the given arguments.
    # NOTE(review): no respond_to_missing? override accompanies this.
    def method_missing name, *args, &block
      @data[name] = @page.txt! *args
    end

    # # Returns the title of the page
    # def title; go('head > title').text end
    #
    # # Returns the entries of the Meta headers
    # def meta
    #   results = {}
    #
    #   go 'head > meta' do |metas|
    #     metas.each do |meta|
    #       name = meta.attributes.first[0]
    #
    #       if 'charset' == name
    #         value = meta[name]
    #       else
    #         name = meta[name]
    #         value = meta.attributes['content'].value
    #
    #         if Scrapula.verbose
    #           puts "Unknown meta: #{name}" unless ['http-equiv', 'name', 'property'].include? name
    #         end
    #       end
    #
    #       results[name] = value
    #
    #       # block.call(name, value, meta) if block_given?
    #     end if metas
    #   end
    #
    #   results
    # end
    #
    # alias metas meta
    #
    # # Extract text and href attribute from an anchor
    # # If you need more attributes, you can use a block
    # def anchor query, &block
    #   go query do |node|
    #     block_given? ? block.call(node) : [node.text, node.attr('href').text]
    #   end
    # end
    #
    # alias link anchor
    #
    # # Extracts an integer
    # # The default sanitization can admit integers like 4,123 or 345.678.234 FIXME
    # # If you need a complex one, you can use a block
    # def int query, &block
    #   go query do |node|
    #     block_given? ? block.call(node.text) : node.text.slice(/\d+/).to_i
    #   end
    # end
    #
    # # TODO Extracts the data of a table and returns it as an array or hash
    # def table query, &block
    #
    # end
    #
    # # Get nodes (Nokogiri::XML::Node)
    # # It yield the node and an empty hash to the block TODO remove results...
    # def nodes query, &block
    #   results = []
    #
    #   go query do |nodes|
    #     nodes.each do |node|
    #       result = {}
    #       results << result if yield node, result
    #     end if nodes
    #   end
    #
    #   results
    # end
    #
    # private
    #
    # # Search the page with XPath / CSS query
    # def go query, &block
    #   page {|p| block_given? ? block.call(p.search query) : p.search(query) }
    # end

  end
end
# data/lib/scrapula/page.rb
# Wraps an agent (presumably Mechanize — TODO confirm) page object and exposes
# query helpers. Bang methods query the page; each accepts an optional list of
# operations (method names) applied in order to the raw result.
module Scrapula
  class Page

    def initialize agent_page
      @agent_page = agent_page
    end

    # TODO
    # def url!
    #   @agent_page.url
    # end

    # Runs the scraping DSL block against this page and returns the data Hash.
    def scrape &block
      scraper = Scrapula::Scraper.new self, &block
      scraper.data!
    end

    # Memoized accessor for the page's meta headers.
    def meta!
      @meta ||= Meta.new self
    end
    alias metas! meta!

    # at returns the first one only, but search returns all
    # Returns all nodes matching query; operations are sent, in order, to the
    # result (not to each node — see FIXME). Yields the result when a block is given.
    def search! query, operations = [], &block
      result = @agent_page.search query

      # FIXME on every object
      result = operations.reduce(result) do |tmp, op|
        tmp.__send__ op
      end if result

      yield result if block_given?

      result
    end

    # at returns the first one only, but search returns all
    # TODO merge with search!
    # Returns the first node matching query (nil when absent), after applying
    # the operations chain; yields the result when a block is given.
    def at! query, operations = [], &block
      result = @agent_page.at query
      result = operations.reduce(result) {|tmp, op| tmp.__send__ op } if result

      yield result if block_given?

      result
    end

    # TODO
    # def attribute! query, name, operations = [], &block
    # end

    # First match rendered as HTML.
    # NOTE(review): unshift mutates the caller's operations array in place.
    def html! query, operations = [], &block
      operations.unshift :to_html
      at! query, operations, &block
    end

    # First match's text content.
    # NOTE(review): unshift mutates the caller's operations array in place.
    def text! query, operations = [], &block
      operations.unshift :text
      at! query, operations, &block
    end
    alias txt! text!

    # TODO xhtml! (to_xhtml) ?
    # TODO inner_html / inner_text ?

    # The following extractors are unimplemented stubs: they return nil.
    def regex! query, re, operations = [], &block
    end
    alias re! regex!

    def int! query, re, operations = [], &block
    end

    def decimal! query, operations = [], &block
    end

    def datetime! query, format, operations = [], &block
    end

    def date! query, format, operations = [], &block
    end

    def time! query, format, operations = [], &block
    end

    # def css! query, operations = [], &block
    #   @agent_page.css query
    # end

    # def xpath! query, operations = [], &block
    # end

    # TODO
    # def validates!
    # end

    # TODO?
    # def rescue! error, &block
    # end

    # TODO title, meta/s, anchors/links, table
  end

  # Class.new {
  #   def regex node_set, regex
  #     node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
  #   end
  # }.new

end