Dynamised 0.1.5 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/dynamised/helpers.rb +2 -2
- data/lib/dynamised/meta.rb +1 -1
- data/lib/dynamised/scraper.rb +28 -5
- data/lib/dynamised/scraper_dsl.rb +31 -10
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 095196c60ad862112370409060962832c85306b2
|
4
|
+
data.tar.gz: d5106d54047da1901bbe2ad0a0894f1245eee38b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eaec3611b7acf56e4864c1c738353ac226a5c81c7927eedb491df7bdb92d5ded7e1b5dfd07284a2cd5cec556cb083af9abad22c6d7ca9c47273634a6fd143d67
|
7
|
+
data.tar.gz: cef11f2de483ad33a97986c18a6b770ba30c3c0becc3beb700a0b2a2abba364e4f530f576466cd8af8de857c9a615a21fd4cfe6e3c45996fd73c161028606a40
|
data/lib/dynamised/helpers.rb
CHANGED
@@ -5,8 +5,8 @@ module Dynamised
|
|
5
5
|
Nokogiri::HTML(html)
|
6
6
|
end
|
7
7
|
|
8
|
-
def
|
9
|
-
html_listing.xpath(".%s" %
|
8
|
+
def crawl(html_listing)
|
9
|
+
html_listing.xpath(".%s" % get_crawl_tag[:path]).attr('href').to_s
|
10
10
|
end
|
11
11
|
|
12
12
|
def mpc(doc)
|
data/lib/dynamised/meta.rb
CHANGED
data/lib/dynamised/scraper.rb
CHANGED
@@ -100,15 +100,38 @@ module Dynamised
|
|
100
100
|
if fields?(tree)
|
101
101
|
scrape(doc,tree,&block)
|
102
102
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
|
107
|
-
pull(get_doc(segment?(url)),sub_tr||node,&block)
|
103
|
+
if pagination?(doc,tree)
|
104
|
+
paginate(tree) do |item|
|
105
|
+
pull(item,tree,&block)
|
108
106
|
end
|
107
|
+
else
|
108
|
+
childs(tree) do |pos,node,sub_tr|
|
109
|
+
@current_child = node
|
110
|
+
spt = node.data[:meta][:crawl_tag]
|
111
|
+
scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
|
112
|
+
pull(get_doc(segment?(url)),sub_tr||node,&block)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def paginate(doc,tree)
|
119
|
+
current_page = doc
|
120
|
+
max = scrape_tag(current_page,tree[:paginate][:max],{r_type: :to_i})
|
121
|
+
raise "No paginate max tag found" unless max
|
122
|
+
(1..max).each do
|
123
|
+
(current_page.xpath(tree[:paginate][:item])).each do |node|
|
124
|
+
yield(item)
|
125
|
+
end
|
126
|
+
current_page = get_doc(current_page.xpath(tree[:paginate][:next]).attr('href'))
|
109
127
|
end
|
110
128
|
end
|
111
129
|
|
130
|
+
def pagination?(doc,tree)
|
131
|
+
search_for_tag(doc,tree[:paginate][:if])
|
132
|
+
end
|
133
|
+
|
134
|
+
|
112
135
|
def segment?(url)
|
113
136
|
url =~ /http/ ? url : "%s/%s" % [@base_url.gsub(/\/$|\z/,''), url.gsub(/\A\//,'')]
|
114
137
|
end
|
data/lib/dynamised/scraper_dsl.rb
CHANGED
@@ -8,6 +8,7 @@ module Dynamised
|
|
8
8
|
@tree = Node.new({
|
9
9
|
fields: {},
|
10
10
|
meta: {},
|
11
|
+
paginate: {},
|
11
12
|
recursive_select: false,
|
12
13
|
select: false,
|
13
14
|
scrape_if: nil
|
@@ -48,26 +49,42 @@ module Dynamised
|
|
48
49
|
def xpath_prefix(prefix,&block)
|
49
50
|
check_for_block(&block)
|
50
51
|
@xpath_prefix << prefix
|
51
|
-
|
52
|
+
block.call
|
52
53
|
@xpath_prefix.pop
|
53
54
|
end
|
54
55
|
|
55
56
|
|
56
57
|
def scrape_here_if(args=nil,&block)
|
57
|
-
|
58
|
+
at_p.data[:scrape_if] = args || {block: block}
|
59
|
+
end
|
60
|
+
|
61
|
+
def select_crawl
|
62
|
+
at_p.data[:select] = true
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def pag_if(check)
|
67
|
+
at_p.data[:paginate][:if] = check
|
58
68
|
end
|
59
69
|
|
60
|
-
def
|
61
|
-
|
70
|
+
def pag_next(xpath)
|
71
|
+
at_p.data[:paginate][:next] = xpath
|
62
72
|
end
|
63
73
|
|
74
|
+
def pag_inc(xpath)
|
75
|
+
at_p.data[:paginate][:inc] = xpath
|
76
|
+
end
|
77
|
+
|
78
|
+
def pag_item(xpath)
|
79
|
+
at_p.data[:paginate][:item] = xpath
|
80
|
+
end
|
64
81
|
|
65
|
-
|
66
|
-
def
|
82
|
+
|
83
|
+
def crawl(items,&block)
|
67
84
|
items.each do |item,path|
|
68
|
-
|
85
|
+
at_p.new_child(item)
|
69
86
|
tree_down(item) do
|
70
|
-
set_meta_tag(:
|
87
|
+
set_meta_tag(:crawl_tag,join_xpath(path),{attr: [:attr,:href]})
|
71
88
|
block.call
|
72
89
|
end
|
73
90
|
end
|
@@ -82,12 +99,16 @@ module Dynamised
|
|
82
99
|
set_info(:meta,name,xpath,meta)
|
83
100
|
end
|
84
101
|
|
85
|
-
def writer(writers)
|
86
|
-
@writer = writers
|
102
|
+
def writer(writers=nil,&block)
|
103
|
+
@writer = writers || block
|
87
104
|
end
|
88
105
|
|
89
106
|
private
|
90
107
|
|
108
|
+
def at_p
|
109
|
+
@tree[@tree_pointer]
|
110
|
+
end
|
111
|
+
|
91
112
|
def check_for_block(&block)
|
92
113
|
raise "No block given for #%s" % caller[0][/`.*'/][1..-2] unless block_given?
|
93
114
|
end
|