entp-ruby-openid 2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +215 -0
- data/INSTALL +47 -0
- data/LICENSE +210 -0
- data/NOTICE +2 -0
- data/README +85 -0
- data/UPGRADE +127 -0
- data/admin/runtests.rb +45 -0
- data/examples/README +32 -0
- data/examples/active_record_openid_store/README +58 -0
- data/examples/active_record_openid_store/XXX_add_open_id_store_to_db.rb +24 -0
- data/examples/active_record_openid_store/XXX_upgrade_open_id_store.rb +26 -0
- data/examples/active_record_openid_store/init.rb +8 -0
- data/examples/active_record_openid_store/lib/association.rb +10 -0
- data/examples/active_record_openid_store/lib/nonce.rb +3 -0
- data/examples/active_record_openid_store/lib/open_id_setting.rb +4 -0
- data/examples/active_record_openid_store/lib/openid_ar_store.rb +57 -0
- data/examples/active_record_openid_store/test/store_test.rb +212 -0
- data/examples/discover +49 -0
- data/examples/rails_openid/README +153 -0
- data/examples/rails_openid/Rakefile +10 -0
- data/examples/rails_openid/app/controllers/application.rb +4 -0
- data/examples/rails_openid/app/controllers/consumer_controller.rb +125 -0
- data/examples/rails_openid/app/controllers/login_controller.rb +45 -0
- data/examples/rails_openid/app/controllers/server_controller.rb +265 -0
- data/examples/rails_openid/app/helpers/application_helper.rb +3 -0
- data/examples/rails_openid/app/helpers/login_helper.rb +2 -0
- data/examples/rails_openid/app/helpers/server_helper.rb +9 -0
- data/examples/rails_openid/app/views/consumer/index.rhtml +81 -0
- data/examples/rails_openid/app/views/layouts/server.rhtml +68 -0
- data/examples/rails_openid/app/views/login/index.rhtml +56 -0
- data/examples/rails_openid/app/views/server/decide.rhtml +26 -0
- data/examples/rails_openid/config/boot.rb +19 -0
- data/examples/rails_openid/config/database.yml +74 -0
- data/examples/rails_openid/config/environment.rb +54 -0
- data/examples/rails_openid/config/environments/development.rb +19 -0
- data/examples/rails_openid/config/environments/production.rb +19 -0
- data/examples/rails_openid/config/environments/test.rb +19 -0
- data/examples/rails_openid/config/routes.rb +24 -0
- data/examples/rails_openid/doc/README_FOR_APP +2 -0
- data/examples/rails_openid/public/404.html +8 -0
- data/examples/rails_openid/public/500.html +8 -0
- data/examples/rails_openid/public/dispatch.cgi +12 -0
- data/examples/rails_openid/public/dispatch.fcgi +26 -0
- data/examples/rails_openid/public/dispatch.rb +12 -0
- data/examples/rails_openid/public/favicon.ico +0 -0
- data/examples/rails_openid/public/images/openid_login_bg.gif +0 -0
- data/examples/rails_openid/public/javascripts/controls.js +750 -0
- data/examples/rails_openid/public/javascripts/dragdrop.js +584 -0
- data/examples/rails_openid/public/javascripts/effects.js +854 -0
- data/examples/rails_openid/public/javascripts/prototype.js +1785 -0
- data/examples/rails_openid/public/robots.txt +1 -0
- data/examples/rails_openid/script/about +3 -0
- data/examples/rails_openid/script/breakpointer +3 -0
- data/examples/rails_openid/script/console +3 -0
- data/examples/rails_openid/script/destroy +3 -0
- data/examples/rails_openid/script/generate +3 -0
- data/examples/rails_openid/script/performance/benchmarker +3 -0
- data/examples/rails_openid/script/performance/profiler +3 -0
- data/examples/rails_openid/script/plugin +3 -0
- data/examples/rails_openid/script/process/reaper +3 -0
- data/examples/rails_openid/script/process/spawner +3 -0
- data/examples/rails_openid/script/process/spinner +3 -0
- data/examples/rails_openid/script/runner +3 -0
- data/examples/rails_openid/script/server +3 -0
- data/examples/rails_openid/test/functional/login_controller_test.rb +18 -0
- data/examples/rails_openid/test/functional/server_controller_test.rb +18 -0
- data/examples/rails_openid/test/test_helper.rb +28 -0
- data/lib/hmac/hmac.rb +112 -0
- data/lib/hmac/sha1.rb +11 -0
- data/lib/hmac/sha2.rb +25 -0
- data/lib/openid.rb +22 -0
- data/lib/openid/association.rb +249 -0
- data/lib/openid/consumer.rb +395 -0
- data/lib/openid/consumer/associationmanager.rb +344 -0
- data/lib/openid/consumer/checkid_request.rb +186 -0
- data/lib/openid/consumer/discovery.rb +497 -0
- data/lib/openid/consumer/discovery_manager.rb +123 -0
- data/lib/openid/consumer/html_parse.rb +134 -0
- data/lib/openid/consumer/idres.rb +523 -0
- data/lib/openid/consumer/responses.rb +150 -0
- data/lib/openid/cryptutil.rb +115 -0
- data/lib/openid/dh.rb +89 -0
- data/lib/openid/extension.rb +39 -0
- data/lib/openid/extensions/ax.rb +539 -0
- data/lib/openid/extensions/oauth.rb +91 -0
- data/lib/openid/extensions/pape.rb +179 -0
- data/lib/openid/extensions/sreg.rb +277 -0
- data/lib/openid/extras.rb +11 -0
- data/lib/openid/fetchers.rb +258 -0
- data/lib/openid/kvform.rb +136 -0
- data/lib/openid/kvpost.rb +58 -0
- data/lib/openid/message.rb +553 -0
- data/lib/openid/protocolerror.rb +12 -0
- data/lib/openid/server.rb +1544 -0
- data/lib/openid/store.rb +10 -0
- data/lib/openid/store/filesystem.rb +272 -0
- data/lib/openid/store/interface.rb +75 -0
- data/lib/openid/store/memcache.rb +109 -0
- data/lib/openid/store/memory.rb +84 -0
- data/lib/openid/store/nonce.rb +68 -0
- data/lib/openid/trustroot.rb +349 -0
- data/lib/openid/urinorm.rb +75 -0
- data/lib/openid/util.rb +119 -0
- data/lib/openid/version.rb +3 -0
- data/lib/openid/yadis.rb +15 -0
- data/lib/openid/yadis/accept.rb +148 -0
- data/lib/openid/yadis/constants.rb +21 -0
- data/lib/openid/yadis/discovery.rb +153 -0
- data/lib/openid/yadis/filters.rb +205 -0
- data/lib/openid/yadis/htmltokenizer.rb +305 -0
- data/lib/openid/yadis/parsehtml.rb +45 -0
- data/lib/openid/yadis/services.rb +42 -0
- data/lib/openid/yadis/xrds.rb +155 -0
- data/lib/openid/yadis/xri.rb +90 -0
- data/lib/openid/yadis/xrires.rb +91 -0
- data/test/data/test_discover/openid_utf8.html +11 -0
- data/test/support/test_data_mixin.rb +127 -0
- data/test/support/test_util.rb +53 -0
- data/test/support/yadis_data.rb +131 -0
- data/test/support/yadis_data/accept.txt +124 -0
- data/test/support/yadis_data/dh.txt +29 -0
- data/test/support/yadis_data/example-xrds.xml +14 -0
- data/test/support/yadis_data/linkparse.txt +587 -0
- data/test/support/yadis_data/n2b64 +650 -0
- data/test/support/yadis_data/test1-discover.txt +137 -0
- data/test/support/yadis_data/test1-parsehtml.txt +152 -0
- data/test/support/yadis_data/test_discover/malformed_meta_tag.html +19 -0
- data/test/support/yadis_data/test_discover/openid.html +11 -0
- data/test/support/yadis_data/test_discover/openid2.html +11 -0
- data/test/support/yadis_data/test_discover/openid2_xrds.xml +12 -0
- data/test/support/yadis_data/test_discover/openid2_xrds_no_local_id.xml +11 -0
- data/test/support/yadis_data/test_discover/openid_1_and_2.html +11 -0
- data/test/support/yadis_data/test_discover/openid_1_and_2_xrds.xml +16 -0
- data/test/support/yadis_data/test_discover/openid_1_and_2_xrds_bad_delegate.xml +17 -0
- data/test/support/yadis_data/test_discover/openid_and_yadis.html +12 -0
- data/test/support/yadis_data/test_discover/openid_no_delegate.html +10 -0
- data/test/support/yadis_data/test_discover/openid_utf8.html +11 -0
- data/test/support/yadis_data/test_discover/yadis_0entries.xml +12 -0
- data/test/support/yadis_data/test_discover/yadis_2_bad_local_id.xml +15 -0
- data/test/support/yadis_data/test_discover/yadis_2entries_delegate.xml +22 -0
- data/test/support/yadis_data/test_discover/yadis_2entries_idp.xml +21 -0
- data/test/support/yadis_data/test_discover/yadis_another_delegate.xml +14 -0
- data/test/support/yadis_data/test_discover/yadis_idp.xml +12 -0
- data/test/support/yadis_data/test_discover/yadis_idp_delegate.xml +13 -0
- data/test/support/yadis_data/test_discover/yadis_no_delegate.xml +11 -0
- data/test/support/yadis_data/test_xrds/=j3h.2007.11.14.xrds +25 -0
- data/test/support/yadis_data/test_xrds/README +12 -0
- data/test/support/yadis_data/test_xrds/delegated-20060809-r1.xrds +34 -0
- data/test/support/yadis_data/test_xrds/delegated-20060809-r2.xrds +34 -0
- data/test/support/yadis_data/test_xrds/delegated-20060809.xrds +34 -0
- data/test/support/yadis_data/test_xrds/no-xrd.xml +7 -0
- data/test/support/yadis_data/test_xrds/not-xrds.xml +2 -0
- data/test/support/yadis_data/test_xrds/prefixsometimes.xrds +34 -0
- data/test/support/yadis_data/test_xrds/ref.xrds +109 -0
- data/test/support/yadis_data/test_xrds/sometimesprefix.xrds +34 -0
- data/test/support/yadis_data/test_xrds/spoof1.xrds +25 -0
- data/test/support/yadis_data/test_xrds/spoof2.xrds +25 -0
- data/test/support/yadis_data/test_xrds/spoof3.xrds +37 -0
- data/test/support/yadis_data/test_xrds/status222.xrds +9 -0
- data/test/support/yadis_data/test_xrds/subsegments.xrds +58 -0
- data/test/support/yadis_data/test_xrds/valid-populated-xrds.xml +39 -0
- data/test/support/yadis_data/trustroot.txt +153 -0
- data/test/support/yadis_data/urinorm.txt +79 -0
- data/test/test_accept.rb +170 -0
- data/test/test_association.rb +268 -0
- data/test/test_associationmanager.rb +918 -0
- data/test/test_ax.rb +690 -0
- data/test/test_checkid_request.rb +293 -0
- data/test/test_consumer.rb +260 -0
- data/test/test_cryptutil.rb +119 -0
- data/test/test_dh.rb +85 -0
- data/test/test_discover.rb +848 -0
- data/test/test_discovery_manager.rb +259 -0
- data/test/test_extension.rb +46 -0
- data/test/test_extras.rb +35 -0
- data/test/test_fetchers.rb +554 -0
- data/test/test_filters.rb +269 -0
- data/test/test_helper.rb +4 -0
- data/test/test_idres.rb +961 -0
- data/test/test_kvform.rb +164 -0
- data/test/test_kvpost.rb +64 -0
- data/test/test_linkparse.rb +100 -0
- data/test/test_message.rb +1115 -0
- data/test/test_nonce.rb +89 -0
- data/test/test_oauth.rb +176 -0
- data/test/test_openid_yadis.rb +177 -0
- data/test/test_pape.rb +248 -0
- data/test/test_parsehtml.rb +79 -0
- data/test/test_responses.rb +63 -0
- data/test/test_server.rb +2455 -0
- data/test/test_sreg.rb +479 -0
- data/test/test_stores.rb +292 -0
- data/test/test_trustroot.rb +111 -0
- data/test/test_urinorm.rb +34 -0
- data/test/test_util.rb +145 -0
- data/test/test_xrds.rb +167 -0
- data/test/test_xri.rb +48 -0
- data/test/test_xrires.rb +67 -0
- data/test/test_yadis_discovery.rb +218 -0
- metadata +268 -0
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# This file contains functions and classes used for extracting
|
|
2
|
+
# endpoint information out of a Yadis XRD file using the REXML
|
|
3
|
+
# XML parser.
|
|
4
|
+
|
|
5
|
+
#
|
|
6
|
+
module OpenID
|
|
7
|
+
module Yadis
|
|
8
|
+
# Generic endpoint object holding parsed service information together
# with a reference to the service element it was generated from.  If
# the xrd:Service has more than one xrd:Type or xrd:URI, an instance
# represents just one of those pairs.
#
# Both the class and its instances respond to
# from_basic_service_endpoint, so either one can be handed to the
# filter machinery as the simplest possible filter.
class BasicServiceEndpoint
  attr_reader :type_uris, :yadis_url, :uri, :service_element

  # Stores the four values verbatim; nothing is copied or validated.
  def initialize(yadis_url, type_uris, uri, service_element)
    @yadis_url, @type_uris = yadis_url, type_uris
    @uri, @service_element = uri, service_element
  end

  # Returns the type URIs this endpoint has in common with the given
  # list (an empty array when there are none).  Handy for endpoint
  # classes that must check for several versions of one protocol.
  def match_types(type_uris)
    @type_uris & type_uris
  end

  class << self
    # Identity transform: hands the basic endpoint back unchanged.
    # Subclasses are expected to re-implement this.
    def from_basic_service_endpoint(endpoint)
      endpoint
    end
  end

  # Instance-level twin of the class method, so that an instance can
  # be used wherever the class can.
  def from_basic_service_endpoint(endpoint)
    self.class.from_basic_service_endpoint(endpoint)
  end
end
|
|
53
|
+
|
|
54
|
+
# Turns a list of basic endpoint transformers into a top-level filter.
# Mostly useful for the implementation of make_filter, which should
# only be needed for special cases or internal use by this library.
#
# For every (Type, URI) expansion of a service element a
# BasicServiceEndpoint is built and offered to the transformers in
# turn until one of them returns a value.
class TransformFilterMaker
  attr_reader :filter_procs

  # filter_procs is the list of callables applied to each basic
  # endpoint.  They are tried in order; the first non-nil result wins.
  def initialize(filter_procs)
    @filter_procs = filter_procs
  end

  # Returns an array with the non-nil transformer results for every
  # expansion of service_element produced by Yadis::expand_service.
  def get_service_endpoints(yadis_url, service_element)
    collected = []

    Yadis::expand_service(service_element).each do |type_uris, uri, _|
      # One basic endpoint per yadis_url / Service / Type / URI
      # combination.
      basic = BasicServiceEndpoint.new(
        yadis_url, type_uris, uri, service_element)

      transformed = apply_filters(basic)
      collected << transformed unless transformed.nil?
    end

    collected
  end

  # Runs the transformers against endpoint and returns the first
  # non-nil result; later transformers are not called.  Returns nil
  # when every transformer declines.
  def apply_filters(endpoint)
    @filter_procs.each do |transform|
      transformed = transform.call(endpoint)
      return transformed unless transformed.nil?
    end

    nil
  end
end
|
|
115
|
+
|
|
116
|
+
# A filter made of other filters: it asks each subfilter for its
# endpoints and returns all of them together.
class CompoundFilter
  attr_reader :subfilters

  # subfilters is the list of filter objects (anything responding to
  # get_service_endpoints) whose results are to be collected.
  def initialize(subfilters)
    @subfilters = subfilters
  end

  # Returns the concatenation, in subfilter order, of the endpoint
  # arrays produced by every subfilter for this service element.
  def get_service_endpoints(yadis_url, service_element)
    combined = []

    @subfilters.each do |subfilter|
      found = subfilter.get_service_endpoints(yadis_url, service_element)
      combined = combined + found
    end

    combined
  end
end
|
|
135
|
+
|
|
136
|
+
# Pre-built TypeError describing a value that cannot be turned into a
# filter.
@@filter_type_error = TypeError.new(
  'Expected a filter, an endpoint, a callable or a list of any of these.')
|
|
140
|
+
|
|
141
|
+
# Build a filter from anything that can be converted into one.
#
# parts may be a filter, an endpoint, a callable, or a list of any of
# these.  nil selects BasicServiceEndpoint as the only part.  A single
# non-array value is wrapped in a one-element list before being passed
# to mk_compound_filter.
def self.make_filter(parts)
  parts = [BasicServiceEndpoint] if parts.nil?
  parts = [parts] unless parts.is_a?(Array)

  mk_compound_filter(parts)
end
|
|
157
|
+
|
|
158
|
+
# Create a filter out of a list of filter-like things.
#
# Used by make_filter.
#
# parts must respond to each, and every element must be something
# make_filter accepts:
# * an Array, which is converted recursively into a nested filter;
# * a full filter (responds to get_service_endpoints), used as is;
# * an endpoint (responds to from_basic_service_endpoint), whose
#   conversion method becomes an endpoint transformer;
# * any other callable (responds to call), used as a transformer.
#
# All transformers are bundled into a single TransformFilterMaker that
# is placed after the full filters.  When exactly one filter results
# it is returned directly; otherwise the filters are wrapped in a
# CompoundFilter (this includes the empty case).
#
# Raises TypeError when parts is not iterable or when an element is
# none of the above.
def self.mk_compound_filter(parts)
  unless parts.respond_to?('each')
    raise TypeError, "#{parts.inspect} is not iterable"
  end

  # Separate into a list of callables and a list of filter objects
  transformers = []
  filters = []

  parts.each do |subfilter|
    if subfilter.is_a?(Array)
      filters << mk_compound_filter(subfilter)
    elsif subfilter.respond_to?('get_service_endpoints')
      # It's a full filter
      filters << subfilter
    elsif subfilter.respond_to?('from_basic_service_endpoint')
      # It's an endpoint object, so its conversion method joins the
      # list of endpoint transformers
      transformers << subfilter.method('from_basic_service_endpoint')
    elsif subfilter.respond_to?('call')
      # It's a proc, so it is an endpoint transformer already
      transformers << subfilter
    else
      # A new exception object per failure: re-raising one shared
      # instance would overwrite the backtrace of earlier failures.
      raise TypeError,
            'Expected a filter, an endpoint, a callable or a list of any of these.'
    end
  end

  filters << TransformFilterMaker.new(transformers) unless transformers.empty?

  filters.length == 1 ? filters[0] : CompoundFilter.new(filters)
end
|
|
204
|
+
end
|
|
205
|
+
end
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# = HTMLTokenizer
|
|
2
|
+
#
|
|
3
|
+
# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
|
|
4
|
+
# Copyright:: Copyright (c) 2004 Ben Giddings
|
|
5
|
+
# License:: Distributes under the same terms as Ruby
|
|
6
|
+
#
|
|
7
|
+
#
|
|
8
|
+
# This is a partial port of the functionality behind Perl's TokeParser
|
|
9
|
+
# Provided a page it progressively returns tokens from that page
|
|
10
|
+
#
|
|
11
|
+
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
|
|
12
|
+
|
|
13
|
+
#
|
|
14
|
+
# A class to tokenize HTML.
|
|
15
|
+
#
|
|
16
|
+
# Example:
|
|
17
|
+
#
|
|
18
|
+
# page = "<HTML>
|
|
19
|
+
# <HEAD>
|
|
20
|
+
# <TITLE>This is the title</TITLE>
|
|
21
|
+
# </HEAD>
|
|
22
|
+
# <!-- Here comes the <a href=\"missing.link\">blah</a>
|
|
23
|
+
# comment body
|
|
24
|
+
# -->
|
|
25
|
+
# <BODY>
|
|
26
|
+
# <H1>This is the header</H1>
|
|
27
|
+
# <P>
|
|
28
|
+
# This is the paragraph, it contains
|
|
29
|
+
# <a href=\"link.html\">links</a>,
|
|
30
|
+
# <img src=\"blah.gif\" optional alt='images
|
|
31
|
+
# are
|
|
32
|
+
# really cool'>. Ok, here is some more text and
|
|
33
|
+
# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
|
|
34
|
+
# </P>
|
|
35
|
+
# </body>
|
|
36
|
+
# </HTML>
|
|
37
|
+
# "
|
|
38
|
+
# toke = HTMLTokenizer.new(page)
|
|
39
|
+
#
|
|
40
|
+
# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
|
|
41
|
+
# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
|
|
42
|
+
# assert("links" == toke.getTrimmedText)
|
|
43
|
+
# assert(toke.getTag("IMG", "A").attr_hash['optional'])
|
|
44
|
+
# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
|
|
45
|
+
#
|
|
46
|
+
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  attr_reader :page

  # Create a new tokenizer for content, which is converted to a string
  # with to_s.  Tokenizing starts at the beginning of that string.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, HTMLTag or HTMLText, or nil once the whole
  # page has been consumed.  Raises HTMLTokenizerError when a tag or
  # comment is opened but never closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    if '<' == @page[@cur_pos, 1]
      # Next token is a tag of some kind
      if '!--' == @page[(@cur_pos + 1), 3]
        # Token is a comment
        tag_end = @page.index('-->', (@cur_pos + 1))
        if tag_end.nil?
          raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        # tag_end points at the first '-', so include all of '-->'
        HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
      else
        # Token is a html tag
        tag_end = @page.index('>', (@cur_pos + 1))
        if tag_end.nil?
          raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos .. tag_end])
      end
    else
      # Next token is text: everything up to the next '<', or the rest
      # of the page when there is none
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      HTMLText.new(@page[@cur_pos .. text_end])
    end
  end

  # Get the next token and move past it.  Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or nil when the page is exhausted.
  def getNextToken
    token = peekNextToken
    # Advance by the length of the raw source the token was cut from
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Names are matched case-insensitively; with no names given the next
  # tag of any kind is returned.  Every token up to and including the
  # returned tag is consumed.  Returns nil when no such tag is left.
  def getTag(*sought_tags)
    wanted = sought_tags.collect { |name| name.downcase }

    while (token = getNextToken)
      next unless token.kind_of?(HTMLTag)
      break if wanted.empty? || wanted.include?(token.tag_name)
    end

    token
  end

  # Get text starting at the current position.
  #
  # Without an argument: returns the text of the next token when that
  # token is text (and consumes it), or "" when the next token is a
  # tag or the page is exhausted.
  #
  # With until_tag: consumes tokens until the named tag (matched
  # case-insensitively, e.g. "a" or "/a") is next, and returns the
  # non-empty text of each consumed token, each followed by one space.
  # The terminating tag itself is not consumed.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Next token is a tag, not text
      return "" if '<' == @page[@cur_pos, 1]

      # Next token is text, unless there is nothing left at all
      token = getNextToken
      return token.nil? ? "" : token.text
    end

    # Tag names are stored lower-cased, so compare on that basis
    stop_at = until_tag.to_s.downcase
    ret_str = ""

    while (token = peekNextToken)
      break if token.kind_of?(HTMLTag) and token.tag_name == stop_at

      ret_str << (token.text + " ") if "" != token.text
      getNextToken
    end

    ret_str
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end

end
|
|
168
|
+
|
|
169
|
+
# Raised when text cannot be tokenized: a tag or comment that is never
# closed, or text handed to a token class that is not of that kind.
#
# Derives from StandardError (not Exception) so that an ordinary
# +rescue+ clause catches it.
class HTMLTokenizerError < StandardError
end
|
|
171
|
+
|
|
172
|
+
# The parent class for all three types of HTML tokens
class HTMLToken
  attr_accessor :raw

  # Initialize the token based on the raw text
  def initialize(text)
    @raw = text
  end

  # By default, return exactly the string used to create the token
  def to_s
    raw
  end

  # By default tokens have no text representation
  def text
    ""
  end

  # The text representation with leading and trailing whitespace
  # removed and every run of whitespace replaced by a single space.
  def trimmed_text
    text.strip.gsub(/\s+/m, " ")
  end

  # Compare to another object based on the raw source: true when the
  # other object's to_s equals this token's raw text.
  def ==(other)
    raw == other.to_s
  end
end

# Class representing text that isn't inside a tag
class HTMLText < HTMLToken
  # The text representation of a text token is its raw source.
  def text
    raw
  end
end

# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body without the <!-- --> delimiters and without the
  # whitespace directly inside them.
  attr_accessor :contents

  # Raises HTMLTokenizerError when text is not of the form <!-- ... -->.
  def initialize(text)
    super(text)

    found = /^<!--\s*(.*?)\s*-->$/m.match(text)
    if found.nil?
      raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = found[1]
  end
end

# Class representing an HTML tag
class HTMLTag < HTMLToken
  # end_tag is true for closing tags such as </a>.  tag_name is the
  # lower-cased element name, prefixed with '/' for closing tags.
  attr_reader :end_tag, :tag_name

  # Raises HTMLTokenizerError when text does not start with '<' and
  # end with '>', or when no tag name can be found in it.
  def initialize(text)
    super(text)

    if '<' != text[0, 1] or '>' != text[-1, 1]
      raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = Hash.new

    tag_name = text.scan(/[\w:-]+/)[0]
    if tag_name.nil?
      raise HTMLTokenizerError, "Error, tag is nil: #{text}"
    end

    # A '/' right after the '<' marks an end tag
    @end_tag = ('/' == text[1, 1])
    @tag_name = @end_tag ? '/' + tag_name.downcase : tag_name.downcase

    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes, keyed by lower-cased
  # attribute name.  Lazily done, so that if you don't look at a tag's
  # attributes things go quicker.  End tags always yield an empty hash.
  def attr_hash
    # Lazy initialize == don't build the hash until it's needed
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for an img or applet tag, if it exists, or an
  # empty string otherwise
  def text
    if !end_tag
      case tag_name
      when 'img', 'applet'
        alt = attr_hash['alt']
        return alt unless alt.nil?
      end
    end

    ''
  end

  private

  # Fill @attr_hash from the raw tag source.
  def parse_attributes
    # Everything between the tag name and the closing '>' (or '/>')
    attr_part = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
    return unless attr_part.kind_of?(Array)

    pairs = attr_part[0].scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
    pairs.each do |name, value, _unquoted|
      @attr_hash[name.downcase] =
        if value.nil?
          # An attribute without a value maps to its own name
          name
        elsif '"' == value[0, 1] or '\'' == value[0, 1]
          # Remove the enclosing quotes
          value[1 .. -2]
        else
          value
        end
    end
  end
end
|
|
305
|
+
|