Dynamised 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a4382496b1899c1a709a2b10a4dc5e7a6dff1a4b
4
- data.tar.gz: dbc87478bbb52fd93519af0c5c356772d1d5a7e9
3
+ metadata.gz: 095196c60ad862112370409060962832c85306b2
4
+ data.tar.gz: d5106d54047da1901bbe2ad0a0894f1245eee38b
5
5
  SHA512:
6
- metadata.gz: da07328d22fe81ec98958bc3e683de00afa0707172176d7ea016a3a83dadc899bfe55b9396c3a25e235a105dc35b8eb157ae0b11263b74f9bd57e04fb4f01c3f
7
- data.tar.gz: ee82ed7f95e34dc21e491975b80b9881c3cf09cf05548f549b968fc1e2492fc6511e157fc62213dc3659a5ed0a24392d6e447101ce327bf36c72eb087b2045be
6
+ metadata.gz: eaec3611b7acf56e4864c1c738353ac226a5c81c7927eedb491df7bdb92d5ded7e1b5dfd07284a2cd5cec556cb083af9abad22c6d7ca9c47273634a6fd143d67
7
+ data.tar.gz: cef11f2de483ad33a97986c18a6b770ba30c3c0becc3beb700a0b2a2abba364e4f530f576466cd8af8de857c9a615a21fd4cfe6e3c45996fd73c161028606a40
@@ -5,8 +5,8 @@ module Dynamised
5
5
  Nokogiri::HTML(html)
6
6
  end
7
7
 
8
- def sub_page(html_listing)
9
- html_listing.xpath(".%s" % get_sub_page_tag[:path]).attr('href').to_s
8
+ def crawl(html_listing)
9
+ html_listing.xpath(".%s" % get_crawl_tag[:path]).attr('href').to_s
10
10
  end
11
11
 
12
12
  def mpc(doc)
@@ -1,6 +1,6 @@
1
1
  module Dynamised
2
2
  module META
3
- Version = "0.1.5"
3
+ Version = "0.2.0"
4
4
  Description = <<-DESC.gsub(/^\s*/, '')
5
5
  A tool that allows a user to build a web scraper that works by recursively crawling pages until
6
6
  it finds the requested infomation.
@@ -100,15 +100,38 @@ module Dynamised
100
100
  if fields?(tree)
101
101
  scrape(doc,tree,&block)
102
102
  end
103
- childs(tree) do |pos,node,sub_tr|
104
- @current_child = node
105
- spt = node.data[:meta][:sub_page_tag]
106
- scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
107
- pull(get_doc(segment?(url)),sub_tr||node,&block)
103
+ if pagination?(doc,tree)
104
+ paginate(tree) do |item|
105
+ pull(item,tree,&block)
108
106
  end
107
+ else
108
+ childs(tree) do |pos,node,sub_tr|
109
+ @current_child = node
110
+ spt = node.data[:meta][:crawl_tag]
111
+ scrape_tag_set(doc,spt[:xpath],spt[:meta]) do |url,i|
112
+ pull(get_doc(segment?(url)),sub_tr||node,&block)
113
+ end
114
+ end
115
+ end
116
+ end
117
+
118
+ def paginate(doc,tree)
119
+ current_page = doc
120
+ max = scrape_tag(current_page,tree[:paginate][:max],{r_type: :to_i})
121
+ raise "No paginate max tag found" unless max
122
+ (1..max).each do
123
+ (current_page.xpath(tree[:paginate][:item])).each do |node|
124
+ yield(item)
125
+ end
126
+ current_page = get_doc(current_page.xpath(tree[:paginate][:next]).attr('href'))
109
127
  end
110
128
  end
111
129
 
130
+ def pagination?(doc,tree)
131
+ search_for_tag(doc,tree[:paginate][:if])
132
+ end
133
+
134
+
112
135
  def segment?(url)
113
136
  url =~ /http/ ? url : "%s/%s" % [@base_url.gsub(/\/$|\z/,''), url.gsub(/\A\//,'')]
114
137
  end
@@ -8,6 +8,7 @@ module Dynamised
8
8
  @tree = Node.new({
9
9
  fields: {},
10
10
  meta: {},
11
+ paginate: {},
11
12
  recursive_select: false,
12
13
  select: false,
13
14
  scrape_if: nil
@@ -48,26 +49,42 @@ module Dynamised
48
49
  def xpath_prefix(prefix,&block)
49
50
  check_for_block(&block)
50
51
  @xpath_prefix << prefix
51
- yield
52
+ block.call
52
53
  @xpath_prefix.pop
53
54
  end
54
55
 
55
56
 
56
57
  def scrape_here_if(args=nil,&block)
57
- @tree[@tree_pointer].data[:scrape_if] = args || {block: block}
58
+ at_p.data[:scrape_if] = args || {block: block}
59
+ end
60
+
61
+ def select_crawl
62
+ at_p.data[:select] = true
63
+ end
64
+
65
+
66
+ def pag_if(check)
67
+ at_p.data[:paginate][:if] = check
58
68
  end
59
69
 
60
- def select_sub_page
61
- @tree[@tree_pointer].data[:select] = true
70
+ def pag_next(xpath)
71
+ at_p.data[:paginate][:next] = xpath
62
72
  end
63
73
 
74
+ def pag_inc(xpath)
75
+ at_p.data[:paginate][:inc] = xpath
76
+ end
77
+
78
+ def pag_item(xpath)
79
+ at_p.data[:paginate][:item] = xpath
80
+ end
64
81
 
65
- #recursivly drill into page
66
- def sub_page(items,&block)
82
+
83
+ def crawl(items,&block)
67
84
  items.each do |item,path|
68
- @tree[@tree_pointer].new_child(item)
85
+ at_p.new_child(item)
69
86
  tree_down(item) do
70
- set_meta_tag(:sub_page_tag,join_xpath(path),{attr: [:attr,:href]})
87
+ set_meta_tag(:crawl_tag,join_xpath(path),{attr: [:attr,:href]})
71
88
  block.call
72
89
  end
73
90
  end
@@ -82,12 +99,16 @@ module Dynamised
82
99
  set_info(:meta,name,xpath,meta)
83
100
  end
84
101
 
85
- def writer(writers)
86
- @writer = writers
102
+ def writer(writers=nil,&block)
103
+ @writer = writers || block
87
104
  end
88
105
 
89
106
  private
90
107
 
108
+ def at_p
109
+ @tree[@tree_pointer]
110
+ end
111
+
91
112
  def check_for_block(&block)
92
113
  raise "No block given for #%s" % caller[0][/`.*'/][1..-2] unless block_given?
93
114
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: Dynamised
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Becker