mechanize 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of mechanize might be problematic. Click here for more details.
- data/README +15 -0
- data/examples/rubyforge.rb +13 -0
- data/lib/mechanize.rb +447 -0
- data/lib/mechanize/net-overrides/net/http.rb +2107 -0
- data/lib/mechanize/net-overrides/net/https.rb +171 -0
- data/lib/mechanize/net-overrides/net/protocol.rb +380 -0
- data/lib/mechanize/parsing.rb +200 -0
- data/mechanize.gemspec +22 -0
- metadata +59 -0
@@ -0,0 +1,200 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2005 by Michael Neumann (mneumann@ntecs.de).
|
3
|
+
# Released under the same terms of license as Ruby.
|
4
|
+
#
|
5
|
+
|
6
|
+
require 'rexml/rexml'
|
7
|
+
|
8
|
+
module REXML::Node

  # Visits all element descendants of +self+ recursively. Each element is
  # yielded to the block before its own children are visited. Only element
  # nodes are seen, because the traversal walks +elements+.
  def each_recursive(&block) # :yields: node
    elements.each do |node|
      block.call(node)
      node.each_recursive(&block)
    end
  end

  # Finds (and returns) the first descendant element, in the order used by
  # each_recursive, for which the block evaluates to true. Returns +nil+ if
  # none was found.
  def find_first_recursive(&block) # :yields: node
    each_recursive do |node|
      # +return+ leaves the whole method, which stops the traversal early.
      return node if block.call(node)
    end
    nil
  end

  # Finds all descendant elements (recursively) for which the block evaluates
  # to true and returns them as an array, in the order used by each_recursive.
  def find_all_recursive(&block) # :yields: node
    matches = []
    each_recursive { |node| matches << node if block.call(node) }
    matches
  end

  # Returns the index that +self+ has in its parent's elements array, so that
  # the following equation holds true for element nodes:
  #
  #   node == node.parent.elements[node.index_in_parent]
  #
  # The index is 1-based and counts element siblings only. The position in the
  # parent's complete child list must not be used here: text nodes (for
  # example the whitespace between two tags) would shift the result and the
  # equation above would no longer hold.
  #
  # A node that is not among its parent's elements (e.g. a text node) has no
  # such index; for those the former result is kept, i.e. the position in the
  # parent's complete child list plus one.
  def index_in_parent
    if parent.respond_to?(:elements)
      position = parent.elements.index(self)
      # Elements#index answers -1 when +self+ is not one of the elements.
      return position if position > 0
    end
    parent.index(self) + 1
  end

  # Recursively collects text strings into a (nested) array. Each child
  # element contributes its own nested array; the value of +text+ for this
  # node is appended after them. +nil+ entries are dropped.
  #
  # E.g. the method would return [["abc"], "def"] for this node:
  #
  #   <i><b>abc</b>def</i>
  def collect_text_recursively
    nested = elements.map { |child| child.collect_text_recursively }
    (nested + [text]).compact
  end

  # Returns the text collected by collect_text_recursively, merged into one
  # string. This is equivalent to:
  #
  #   collect_text_recursively.flatten.join("")
  def all_text
    collect_text_recursively.flatten.join("")
  end

end
|
68
|
+
|
69
|
+
#
# Searches the descendants of +root_node+ (via find_first_recursive) for the
# first node that has the given +tag+ name, carries every key/value pair of
# +attributes+ (a Hash) and whose text equals or matches +text+ (a String or
# a regular expression). When +text+ is omitted, the node's text is not
# checked at all.
#
# To find the following node:
#
#   <td class='abc'>text</td>
#
# We use:
#
#   find_node(root, 'td', {'class' => 'abc'}, "text")
#
# Returns +nil+ if no matching node was found.

def find_node(root_node, tag, attributes, text=nil)
  root_node.find_first_recursive do |candidate|
    next false unless candidate.name == tag

    attrs_match = attributes.all? { |key, expected| candidate.attributes[key] == expected }
    next false unless attrs_match

    # +===+ gives equality for a String and a pattern match for a Regexp.
    !text || text === candidate.text
  end
end
|
91
|
+
|
92
|
+
#
# Extracts specific columns (specified by the position of their corresponding
# header cells) from a table.
#
# Given the following table:
#
#   <table>
#     <tr>
#       <td>A</td>
#       <td>B</td>
#       <td>C</td>
#     </tr>
#     <tr>
#       <td>A.1</td>
#       <td>B.1</td>
#       <td>C.1</td>
#     </tr>
#     <tr>
#       <td>A.2</td>
#       <td>B.2</td>
#       <td>C.2</td>
#     </tr>
#   </table>
#
# To extract the first (A) and last (C) column:
#
#   extract_from_table(root_node, ["A", "C"])
#
# And you get this as result:
#
#   [
#     ["A.1", "C.1"],
#     ["A.2", "C.2"]
#   ]
#
# Each entry of +headers+ is handed to find_node as the text to look for in a
# 'td' node below +root_node+, so it may be a String or a regular expression.
#
# A row that has no cell at one of the requested positions yields +nil+ for
# that column.
#
# Raises a RuntimeError if +headers+ is empty, if a header cannot be found, or
# if the header cells do not all share the same parent row.

def extract_from_table(root_node, headers)
  # without at least one header there is no way to locate the table
  raise "no headers given" if headers.empty?

  # extract and collect all header nodes
  header_nodes = headers.map { |header| find_node(root_node, 'td', {}, header) }

  raise "some headers not found" if header_nodes.compact.size < headers.size

  # assert that all headers have the same parent 'header_row', which is the row
  # in which the header_nodes are contained. 'table' is the surrounding table tag.
  header_row = header_nodes.first.parent
  table = header_row.parent

  raise "different parents" unless header_nodes.all? { |n| n.parent == header_row }

  # All positions below are 1-based indexes into an +elements+ collection.
  # They are computed with Elements#index so that text nodes between the tags
  # (e.g. whitespace) cannot shift them.
  rows = table.elements
  columns = header_nodes.map { |n| header_row.elements.index(n) }
  first_data_row = rows.index(header_row) + 1

  # we now iterate over all rows in the table that follow the header_row.
  # for each row we collect the text of the elements at the same positions as
  # the header_nodes. this is what we finally return from the method.
  (first_data_row .. rows.size).map do |inx|
    row = rows[inx]
    columns.map do |column|
      cell = row.elements[column]
      cell && cell.text
    end
  end
end
|
153
|
+
|
154
|
+
# Converts an HTML table into a matrix (2-dim array): one inner array per
# 'tr' child of +table_node+, holding that row's 'td' and 'th' nodes in
# order. Other children of a row are ignored.
#
# If a cell uses 'colspan', the node is stored at the current position of
# the row, followed by (colspan-1) nil values.
#
# Example:
#
#   <table>
#     <tr>
#       <td>A</td>
#       <td>B</td>
#     </tr>
#     <tr>
#       <td colspan="2">C</td>
#     </tr>
#   </table>
#
# Result:
#
#   [
#    [A, B],
#    [C, nil]
#   ]
#
# where A, B and C are the corresponding "<td>" nodes.
#

def table_to_matrix(table_node)
  matrix = []

  table_node.elements.each('tr') do |tr|
    cells = []

    tr.elements.each do |cell|
      next unless cell.name == 'td' || cell.name == 'th'

      cells.push(cell)

      # pad the row so that a spanning cell occupies 'colspan' slots
      span = cell.attributes['colspan']
      span = span ? span.to_i : 1
      1.upto(span - 1) { cells.push(nil) }
    end

    matrix.push(cells)
  end

  matrix
end
|
data/mechanize.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
# The gem version is taken from the library source, which is expected to
# contain an assignment of the form: Version = "x.y.z"
version_match = /Version\s+=\s+"(\d+\.\d+\.\d+)"/.match(File.read('lib/mechanize.rb'))
raise "no version" unless version_match
version = version_match[1]

spec = Gem::Specification.new do |s|
  s.name = 'mechanize'
  s.version = version
  s.summary = 'Automated web-browsing.'
  s.add_dependency('narf', '>= 0.6.3')

  # ship everything below the current directory except Subversion data
  s.files = Dir['**/*'].reject { |path| path.include?(".svn") }

  s.require_path = 'lib'

  s.author = "Michael Neumann"
  s.email = "mneumann@ntecs.de"
  s.homepage = "rubyforge.org/projects/wee"
end
|
metadata
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.8.4
|
3
|
+
specification_version: 1
|
4
|
+
name: mechanize
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2005-01-26
|
8
|
+
summary: Automated web-browsing.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: mneumann@ntecs.de
|
12
|
+
homepage: rubyforge.org/projects/wee
|
13
|
+
rubyforge_project:
|
14
|
+
description:
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
-
|
22
|
+
- ">"
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 0.0.0
|
25
|
+
version:
|
26
|
+
platform: ruby
|
27
|
+
authors:
|
28
|
+
- Michael Neumann
|
29
|
+
files:
|
30
|
+
- examples
|
31
|
+
- lib
|
32
|
+
- mechanize.gemspec
|
33
|
+
- README
|
34
|
+
- examples/rubyforge.rb
|
35
|
+
- lib/mechanize
|
36
|
+
- lib/mechanize.rb
|
37
|
+
- lib/mechanize/net-overrides
|
38
|
+
- lib/mechanize/parsing.rb
|
39
|
+
- lib/mechanize/net-overrides/net
|
40
|
+
- lib/mechanize/net-overrides/net/protocol.rb
|
41
|
+
- lib/mechanize/net-overrides/net/http.rb
|
42
|
+
- lib/mechanize/net-overrides/net/https.rb
|
43
|
+
test_files: []
|
44
|
+
rdoc_options: []
|
45
|
+
extra_rdoc_files: []
|
46
|
+
executables: []
|
47
|
+
extensions: []
|
48
|
+
requirements: []
|
49
|
+
dependencies:
|
50
|
+
- !ruby/object:Gem::Dependency
|
51
|
+
name: narf
|
52
|
+
version_requirement:
|
53
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
54
|
+
requirements:
|
55
|
+
-
|
56
|
+
- ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: 0.6.3
|
59
|
+
version:
|