Dynamised 0.1.5 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a4382496b1899c1a709a2b10a4dc5e7a6dff1a4b
4
- data.tar.gz: dbc87478bbb52fd93519af0c5c356772d1d5a7e9
3
+ metadata.gz: 095196c60ad862112370409060962832c85306b2
4
+ data.tar.gz: d5106d54047da1901bbe2ad0a0894f1245eee38b
5
5
  SHA512:
6
- metadata.gz: da07328d22fe81ec98958bc3e683de00afa0707172176d7ea016a3a83dadc899bfe55b9396c3a25e235a105dc35b8eb157ae0b11263b74f9bd57e04fb4f01c3f
7
- data.tar.gz: ee82ed7f95e34dc21e491975b80b9881c3cf09cf05548f549b968fc1e2492fc6511e157fc62213dc3659a5ed0a24392d6e447101ce327bf36c72eb087b2045be
6
+ metadata.gz: eaec3611b7acf56e4864c1c738353ac226a5c81c7927eedb491df7bdb92d5ded7e1b5dfd07284a2cd5cec556cb083af9abad22c6d7ca9c47273634a6fd143d67
7
+ data.tar.gz: cef11f2de483ad33a97986c18a6b770ba30c3c0becc3beb700a0b2a2abba364e4f530f576466cd8af8de857c9a615a21fd4cfe6e3c45996fd73c161028606a40
@@ -5,8 +5,8 @@ module Dynamised
5
5
  Nokogiri::HTML(html)
6
6
  end
7
7
 
8
- def sub_page(html_listing)
9
- html_listing.xpath(".%s" % get_sub_page_tag[:path]).attr('href').to_s
8
+ def crawl(html_listing)
9
+ html_listing.xpath(".%s" % get_crawl_tag[:path]).attr('href').to_s
10
10
  end
11
11
 
12
12
  def mpc(doc)
@@ -1,6 +1,6 @@
1
1
  module Dynamised
2
2
  module META
3
- Version = "0.1.5"
3
+ Version = "0.2.0"
4
4
  Description = <<-DESC.gsub(/^\s*/, '')
5
5
  A tool that allows a user to build a web scraper that works by recursively crawling pages until
6
6
  it finds the requested infomation.
@@ -100,15 +100,38 @@ module Dynamised
100
100
  if fields?(tree)
101
101
  scrape(doc,tree,&block)
102
102
  end
103
- childs(tree) do |pos,node,sub_tr|
104
- @current_child = node
105
- spt = node.data[:meta][:sub_page_tag]
106
- scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
107
- pull(get_doc(segment?(url)),sub_tr||node,&block)
103
+ if pagination?(doc,tree)
104
+ paginate(tree) do |item|
105
+ pull(item,tree,&block)
108
106
  end
107
+ else
108
+ childs(tree) do |pos,node,sub_tr|
109
+ @current_child = node
110
+ spt = node.data[:meta][:crawl_tag]
111
+ scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
112
+ pull(get_doc(segment?(url)),sub_tr||node,&block)
113
+ end
114
+ end
115
+ end
116
+ end
117
+
118
+ def paginate(doc,tree)
119
+ current_page = doc
120
+ max = scrape_tag(current_page,tree[:paginate][:max],{r_type: :to_i})
121
+ raise "No paginate max tag found" unless max
122
+ (1..max).each do
123
+ (current_page.xpath(tree[:paginate][:item])).each do |node|
124
+ yield(item)
125
+ end
126
+ current_page = get_doc(current_page.xpath(tree[:paginate][:next]).attr('href'))
109
127
  end
110
128
  end
111
129
 
130
+ def pagination?(doc,tree)
131
+ search_for_tag(doc,tree[:paginate][:if])
132
+ end
133
+
134
+
112
135
  def segment?(url)
113
136
  url =~ /http/ ? url : "%s/%s" % [@base_url.gsub(/\/$|\z/,''), url.gsub(/\A\//,'')]
114
137
  end
@@ -8,6 +8,7 @@ module Dynamised
8
8
  @tree = Node.new({
9
9
  fields: {},
10
10
  meta: {},
11
+ paginate: {},
11
12
  recursive_select: false,
12
13
  select: false,
13
14
  scrape_if: nil
@@ -48,26 +49,42 @@ module Dynamised
48
49
  def xpath_prefix(prefix,&block)
49
50
  check_for_block(&block)
50
51
  @xpath_prefix << prefix
51
- yield
52
+ block.call
52
53
  @xpath_prefix.pop
53
54
  end
54
55
 
55
56
 
56
57
  def scrape_here_if(args=nil,&block)
57
- @tree[@tree_pointer].data[:scrape_if] = args || {block: block}
58
+ at_p.data[:scrape_if] = args || {block: block}
59
+ end
60
+
61
+ def select_crawl
62
+ at_p.data[:select] = true
63
+ end
64
+
65
+
66
+ def pag_if(check)
67
+ at_p.data[:paginate][:if] = check
58
68
  end
59
69
 
60
- def select_sub_page
61
- @tree[@tree_pointer].data[:select] = true
70
+ def pag_next(xpath)
71
+ at_p.data[:paginate][:next] = xpath
62
72
  end
63
73
 
74
+ def pag_inc(xpath)
75
+ at_p.data[:paginate][:inc] = xpath
76
+ end
77
+
78
+ def pag_item(xpath)
79
+ at_p.data[:paginate][:item] = xpath
80
+ end
64
81
 
65
- #recursivly drill into page
66
- def sub_page(items,&block)
82
+
83
+ def crawl(items,&block)
67
84
  items.each do |item,path|
68
- @tree[@tree_pointer].new_child(item)
85
+ at_p.new_child(item)
69
86
  tree_down(item) do
70
- set_meta_tag(:sub_page_tag,join_xpath(path),{attr: [:attr,:href]})
87
+ set_meta_tag(:crawl_tag,join_xpath(path),{attr: [:attr,:href]})
71
88
  block.call
72
89
  end
73
90
  end
@@ -82,12 +99,16 @@ module Dynamised
82
99
  set_info(:meta,name,xpath,meta)
83
100
  end
84
101
 
85
- def writer(writers)
86
- @writer = writers
102
+ def writer(writers=nil,&block)
103
+ @writer = writers || block
87
104
  end
88
105
 
89
106
  private
90
107
 
108
+ def at_p
109
+ @tree[@tree_pointer]
110
+ end
111
+
91
112
  def check_for_block(&block)
92
113
  raise "No block given for #%s" % caller[0][/`.*'/][1..-2] unless block_given?
93
114
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: Dynamised
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Becker