Dynamised 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/dynamised/helpers.rb +2 -2
- data/lib/dynamised/meta.rb +1 -1
- data/lib/dynamised/scraper.rb +28 -5
- data/lib/dynamised/scraper_dsl.rb +31 -10
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 095196c60ad862112370409060962832c85306b2
|
4
|
+
data.tar.gz: d5106d54047da1901bbe2ad0a0894f1245eee38b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eaec3611b7acf56e4864c1c738353ac226a5c81c7927eedb491df7bdb92d5ded7e1b5dfd07284a2cd5cec556cb083af9abad22c6d7ca9c47273634a6fd143d67
|
7
|
+
data.tar.gz: cef11f2de483ad33a97986c18a6b770ba30c3c0becc3beb700a0b2a2abba364e4f530f576466cd8af8de857c9a615a21fd4cfe6e3c45996fd73c161028606a40
|
data/lib/dynamised/helpers.rb
CHANGED
@@ -5,8 +5,8 @@ module Dynamised
|
|
5
5
|
Nokogiri::HTML(html)
|
6
6
|
end
|
7
7
|
|
8
|
-
def
|
9
|
-
html_listing.xpath(".%s" %
|
8
|
+
def crawl(html_listing)
|
9
|
+
html_listing.xpath(".%s" % get_crawl_tag[:path]).attr('href').to_s
|
10
10
|
end
|
11
11
|
|
12
12
|
def mpc(doc)
|
data/lib/dynamised/meta.rb
CHANGED
data/lib/dynamised/scraper.rb
CHANGED
@@ -100,15 +100,38 @@ module Dynamised
|
|
100
100
|
if fields?(tree)
|
101
101
|
scrape(doc,tree,&block)
|
102
102
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
|
107
|
-
pull(get_doc(segment?(url)),sub_tr||node,&block)
|
103
|
+
if pagination?(doc,tree)
|
104
|
+
paginate(tree) do |item|
|
105
|
+
pull(item,tree,&block)
|
108
106
|
end
|
107
|
+
else
|
108
|
+
childs(tree) do |pos,node,sub_tr|
|
109
|
+
@current_child = node
|
110
|
+
spt = node.data[:meta][:crawl_tag]
|
111
|
+
scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
|
112
|
+
pull(get_doc(segment?(url)),sub_tr||node,&block)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def paginate(doc,tree)
|
119
|
+
current_page = doc
|
120
|
+
max = scrape_tag(current_page,tree[:paginate][:max],{r_type: :to_i})
|
121
|
+
raise "No paginate max tag found" unless max
|
122
|
+
(1..max).each do
|
123
|
+
(current_page.xpath(tree[:paginate][:item])).each do |node|
|
124
|
+
yield(item)
|
125
|
+
end
|
126
|
+
current_page = get_doc(current_page.xpath(tree[:paginate][:next]).attr('href'))
|
109
127
|
end
|
110
128
|
end
|
111
129
|
|
130
|
+
def pagination?(doc,tree)
|
131
|
+
search_for_tag(doc,tree[:paginate][:if])
|
132
|
+
end
|
133
|
+
|
134
|
+
|
112
135
|
def segment?(url)
|
113
136
|
url =~ /http/ ? url : "%s/%s" % [@base_url.gsub(/\/$|\z/,''), url.gsub(/\A\//,'')]
|
114
137
|
end
|
data/lib/dynamised/scraper_dsl.rb
CHANGED
@@ -8,6 +8,7 @@ module Dynamised
|
|
8
8
|
@tree = Node.new({
|
9
9
|
fields: {},
|
10
10
|
meta: {},
|
11
|
+
paginate: {},
|
11
12
|
recursive_select: false,
|
12
13
|
select: false,
|
13
14
|
scrape_if: nil
|
@@ -48,26 +49,42 @@ module Dynamised
|
|
48
49
|
def xpath_prefix(prefix,&block)
|
49
50
|
check_for_block(&block)
|
50
51
|
@xpath_prefix << prefix
|
51
|
-
|
52
|
+
block.call
|
52
53
|
@xpath_prefix.pop
|
53
54
|
end
|
54
55
|
|
55
56
|
|
56
57
|
def scrape_here_if(args=nil,&block)
|
57
|
-
|
58
|
+
at_p.data[:scrape_if] = args || {block: block}
|
59
|
+
end
|
60
|
+
|
61
|
+
def select_crawl
|
62
|
+
at_p.data[:select] = true
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
def pag_if(check)
|
67
|
+
at_p.data[:paginate][:if] = check
|
58
68
|
end
|
59
69
|
|
60
|
-
def
|
61
|
-
|
70
|
+
def pag_next(xpath)
|
71
|
+
at_p.data[:paginate][:next] = xpath
|
62
72
|
end
|
63
73
|
|
74
|
+
def pag_inc(xpath)
|
75
|
+
at_p.data[:paginate][:inc] = xpath
|
76
|
+
end
|
77
|
+
|
78
|
+
def pag_item(xpath)
|
79
|
+
at_p.data[:paginate][:item] = xpath
|
80
|
+
end
|
64
81
|
|
65
|
-
|
66
|
-
def
|
82
|
+
|
83
|
+
def crawl(items,&block)
|
67
84
|
items.each do |item,path|
|
68
|
-
|
85
|
+
at_p.new_child(item)
|
69
86
|
tree_down(item) do
|
70
|
-
set_meta_tag(:
|
87
|
+
set_meta_tag(:crawl_tag,join_xpath(path),{attr: [:attr,:href]})
|
71
88
|
block.call
|
72
89
|
end
|
73
90
|
end
|
@@ -82,12 +99,16 @@ module Dynamised
|
|
82
99
|
set_info(:meta,name,xpath,meta)
|
83
100
|
end
|
84
101
|
|
85
|
-
def writer(writers)
|
86
|
-
@writer = writers
|
102
|
+
def writer(writers=nil,&block)
|
103
|
+
@writer = writers || block
|
87
104
|
end
|
88
105
|
|
89
106
|
private
|
90
107
|
|
108
|
+
def at_p
|
109
|
+
@tree[@tree_pointer]
|
110
|
+
end
|
111
|
+
|
91
112
|
def check_for_block(&block)
|
92
113
|
raise "No block given for #%s" % caller[0][/`.*'/][1..-2] unless block_given?
|
93
114
|
end
|