nov-ruby-openid 2.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +215 -0
- data/CHANGES-2.1.0 +36 -0
- data/INSTALL +47 -0
- data/LICENSE +210 -0
- data/NOTICE +2 -0
- data/README +81 -0
- data/Rakefile +98 -0
- data/UPGRADE +127 -0
- data/VERSION +1 -0
- data/contrib/google/ruby-openid-apps-discovery-1.0.gem +0 -0
- data/contrib/google/ruby-openid-apps-discovery-1.01.gem +0 -0
- data/examples/README +32 -0
- data/examples/active_record_openid_store/README +58 -0
- data/examples/active_record_openid_store/XXX_add_open_id_store_to_db.rb +24 -0
- data/examples/active_record_openid_store/XXX_upgrade_open_id_store.rb +26 -0
- data/examples/active_record_openid_store/init.rb +8 -0
- data/examples/active_record_openid_store/lib/association.rb +10 -0
- data/examples/active_record_openid_store/lib/nonce.rb +3 -0
- data/examples/active_record_openid_store/lib/open_id_setting.rb +4 -0
- data/examples/active_record_openid_store/lib/openid_ar_store.rb +57 -0
- data/examples/active_record_openid_store/test/store_test.rb +212 -0
- data/examples/discover +49 -0
- data/examples/rails_openid/README +153 -0
- data/examples/rails_openid/Rakefile +10 -0
- data/examples/rails_openid/app/controllers/application.rb +4 -0
- data/examples/rails_openid/app/controllers/consumer_controller.rb +122 -0
- data/examples/rails_openid/app/controllers/login_controller.rb +45 -0
- data/examples/rails_openid/app/controllers/server_controller.rb +265 -0
- data/examples/rails_openid/app/helpers/application_helper.rb +3 -0
- data/examples/rails_openid/app/helpers/login_helper.rb +2 -0
- data/examples/rails_openid/app/helpers/server_helper.rb +9 -0
- data/examples/rails_openid/app/views/consumer/index.rhtml +81 -0
- data/examples/rails_openid/app/views/layouts/server.rhtml +68 -0
- data/examples/rails_openid/app/views/login/index.rhtml +56 -0
- data/examples/rails_openid/app/views/server/decide.rhtml +26 -0
- data/examples/rails_openid/config/boot.rb +19 -0
- data/examples/rails_openid/config/database.yml +74 -0
- data/examples/rails_openid/config/environment.rb +54 -0
- data/examples/rails_openid/config/environments/development.rb +19 -0
- data/examples/rails_openid/config/environments/production.rb +19 -0
- data/examples/rails_openid/config/environments/test.rb +19 -0
- data/examples/rails_openid/config/routes.rb +24 -0
- data/examples/rails_openid/doc/README_FOR_APP +2 -0
- data/examples/rails_openid/public/.htaccess +40 -0
- data/examples/rails_openid/public/404.html +8 -0
- data/examples/rails_openid/public/500.html +8 -0
- data/examples/rails_openid/public/dispatch.cgi +12 -0
- data/examples/rails_openid/public/dispatch.fcgi +26 -0
- data/examples/rails_openid/public/dispatch.rb +12 -0
- data/examples/rails_openid/public/favicon.ico +0 -0
- data/examples/rails_openid/public/images/openid_login_bg.gif +0 -0
- data/examples/rails_openid/public/javascripts/controls.js +750 -0
- data/examples/rails_openid/public/javascripts/dragdrop.js +584 -0
- data/examples/rails_openid/public/javascripts/effects.js +854 -0
- data/examples/rails_openid/public/javascripts/prototype.js +1785 -0
- data/examples/rails_openid/public/robots.txt +1 -0
- data/examples/rails_openid/script/about +3 -0
- data/examples/rails_openid/script/breakpointer +3 -0
- data/examples/rails_openid/script/console +3 -0
- data/examples/rails_openid/script/destroy +3 -0
- data/examples/rails_openid/script/generate +3 -0
- data/examples/rails_openid/script/performance/benchmarker +3 -0
- data/examples/rails_openid/script/performance/profiler +3 -0
- data/examples/rails_openid/script/plugin +3 -0
- data/examples/rails_openid/script/process/reaper +3 -0
- data/examples/rails_openid/script/process/spawner +3 -0
- data/examples/rails_openid/script/process/spinner +3 -0
- data/examples/rails_openid/script/runner +3 -0
- data/examples/rails_openid/script/server +3 -0
- data/examples/rails_openid/test/functional/login_controller_test.rb +18 -0
- data/examples/rails_openid/test/functional/server_controller_test.rb +18 -0
- data/examples/rails_openid/test/test_helper.rb +28 -0
- data/lib/hmac/hmac.rb +112 -0
- data/lib/hmac/sha1.rb +11 -0
- data/lib/hmac/sha2.rb +25 -0
- data/lib/openid.rb +20 -0
- data/lib/openid/association.rb +249 -0
- data/lib/openid/consumer.rb +395 -0
- data/lib/openid/consumer/associationmanager.rb +344 -0
- data/lib/openid/consumer/checkid_request.rb +186 -0
- data/lib/openid/consumer/discovery.rb +497 -0
- data/lib/openid/consumer/discovery_manager.rb +123 -0
- data/lib/openid/consumer/html_parse.rb +134 -0
- data/lib/openid/consumer/idres.rb +523 -0
- data/lib/openid/consumer/responses.rb +148 -0
- data/lib/openid/cryptutil.rb +115 -0
- data/lib/openid/dh.rb +89 -0
- data/lib/openid/extension.rb +39 -0
- data/lib/openid/extensions/ax.rb +539 -0
- data/lib/openid/extensions/oauth.rb +91 -0
- data/lib/openid/extensions/pape.rb +179 -0
- data/lib/openid/extensions/sreg.rb +277 -0
- data/lib/openid/extensions/ui.rb +53 -0
- data/lib/openid/extras.rb +11 -0
- data/lib/openid/fetchers.rb +258 -0
- data/lib/openid/kvform.rb +136 -0
- data/lib/openid/kvpost.rb +58 -0
- data/lib/openid/message.rb +553 -0
- data/lib/openid/protocolerror.rb +8 -0
- data/lib/openid/server.rb +1544 -0
- data/lib/openid/store/filesystem.rb +271 -0
- data/lib/openid/store/interface.rb +75 -0
- data/lib/openid/store/memcache.rb +107 -0
- data/lib/openid/store/memory.rb +84 -0
- data/lib/openid/store/nonce.rb +68 -0
- data/lib/openid/trustroot.rb +349 -0
- data/lib/openid/urinorm.rb +75 -0
- data/lib/openid/util.rb +110 -0
- data/lib/openid/yadis/accept.rb +148 -0
- data/lib/openid/yadis/constants.rb +21 -0
- data/lib/openid/yadis/discovery.rb +153 -0
- data/lib/openid/yadis/filters.rb +205 -0
- data/lib/openid/yadis/htmltokenizer.rb +305 -0
- data/lib/openid/yadis/parsehtml.rb +45 -0
- data/lib/openid/yadis/services.rb +42 -0
- data/lib/openid/yadis/xrds.rb +155 -0
- data/lib/openid/yadis/xri.rb +90 -0
- data/lib/openid/yadis/xrires.rb +99 -0
- data/setup.rb +1551 -0
- data/test/data/accept.txt +124 -0
- data/test/data/dh.txt +29 -0
- data/test/data/example-xrds.xml +14 -0
- data/test/data/linkparse.txt +587 -0
- data/test/data/n2b64 +650 -0
- data/test/data/test1-discover.txt +137 -0
- data/test/data/test1-parsehtml.txt +152 -0
- data/test/data/test_discover/malformed_meta_tag.html +19 -0
- data/test/data/test_discover/openid.html +11 -0
- data/test/data/test_discover/openid2.html +11 -0
- data/test/data/test_discover/openid2_xrds.xml +12 -0
- data/test/data/test_discover/openid2_xrds_no_local_id.xml +11 -0
- data/test/data/test_discover/openid_1_and_2.html +11 -0
- data/test/data/test_discover/openid_1_and_2_xrds.xml +16 -0
- data/test/data/test_discover/openid_1_and_2_xrds_bad_delegate.xml +17 -0
- data/test/data/test_discover/openid_and_yadis.html +12 -0
- data/test/data/test_discover/openid_no_delegate.html +10 -0
- data/test/data/test_discover/openid_utf8.html +11 -0
- data/test/data/test_discover/yadis_0entries.xml +12 -0
- data/test/data/test_discover/yadis_2_bad_local_id.xml +15 -0
- data/test/data/test_discover/yadis_2entries_delegate.xml +22 -0
- data/test/data/test_discover/yadis_2entries_idp.xml +21 -0
- data/test/data/test_discover/yadis_another_delegate.xml +14 -0
- data/test/data/test_discover/yadis_idp.xml +12 -0
- data/test/data/test_discover/yadis_idp_delegate.xml +13 -0
- data/test/data/test_discover/yadis_no_delegate.xml +11 -0
- data/test/data/test_xrds/=j3h.2007.11.14.xrds +25 -0
- data/test/data/test_xrds/README +12 -0
- data/test/data/test_xrds/delegated-20060809-r1.xrds +34 -0
- data/test/data/test_xrds/delegated-20060809-r2.xrds +34 -0
- data/test/data/test_xrds/delegated-20060809.xrds +34 -0
- data/test/data/test_xrds/no-xrd.xml +7 -0
- data/test/data/test_xrds/not-xrds.xml +2 -0
- data/test/data/test_xrds/prefixsometimes.xrds +34 -0
- data/test/data/test_xrds/ref.xrds +109 -0
- data/test/data/test_xrds/sometimesprefix.xrds +34 -0
- data/test/data/test_xrds/spoof1.xrds +25 -0
- data/test/data/test_xrds/spoof2.xrds +25 -0
- data/test/data/test_xrds/spoof3.xrds +37 -0
- data/test/data/test_xrds/status222.xrds +9 -0
- data/test/data/test_xrds/subsegments.xrds +58 -0
- data/test/data/test_xrds/valid-populated-xrds.xml +39 -0
- data/test/data/trustroot.txt +153 -0
- data/test/data/urinorm.txt +79 -0
- data/test/discoverdata.rb +131 -0
- data/test/test_accept.rb +170 -0
- data/test/test_association.rb +266 -0
- data/test/test_associationmanager.rb +917 -0
- data/test/test_ax.rb +690 -0
- data/test/test_checkid_request.rb +294 -0
- data/test/test_consumer.rb +257 -0
- data/test/test_cryptutil.rb +119 -0
- data/test/test_dh.rb +86 -0
- data/test/test_discover.rb +852 -0
- data/test/test_discovery_manager.rb +262 -0
- data/test/test_extension.rb +46 -0
- data/test/test_extras.rb +35 -0
- data/test/test_fetchers.rb +565 -0
- data/test/test_filters.rb +270 -0
- data/test/test_idres.rb +963 -0
- data/test/test_kvform.rb +165 -0
- data/test/test_kvpost.rb +65 -0
- data/test/test_linkparse.rb +101 -0
- data/test/test_message.rb +1116 -0
- data/test/test_nonce.rb +89 -0
- data/test/test_oauth.rb +175 -0
- data/test/test_openid_yadis.rb +178 -0
- data/test/test_pape.rb +247 -0
- data/test/test_parsehtml.rb +80 -0
- data/test/test_responses.rb +63 -0
- data/test/test_server.rb +2457 -0
- data/test/test_sreg.rb +479 -0
- data/test/test_stores.rb +298 -0
- data/test/test_trustroot.rb +113 -0
- data/test/test_ui.rb +93 -0
- data/test/test_urinorm.rb +35 -0
- data/test/test_util.rb +145 -0
- data/test/test_xrds.rb +169 -0
- data/test/test_xri.rb +48 -0
- data/test/test_xrires.rb +63 -0
- data/test/test_yadis_discovery.rb +220 -0
- data/test/testutil.rb +127 -0
- data/test/util.rb +53 -0
- metadata +336 -0
@@ -0,0 +1,205 @@
|
|
1
|
+
# This file contains functions and classes used for extracting
|
2
|
+
# endpoint information out of a Yadis XRD file using the REXML
|
3
|
+
# XML parser.
|
4
|
+
|
5
|
+
#
|
6
|
+
module OpenID
|
7
|
+
module Yadis
|
8
|
+
# Generic endpoint object holding the parsed information for one
# (Type list, URI) combination of an xrd:Service element, together
# with a reference to the service element it was generated from. If
# the xrd:Service has more than one xrd:Type or xrd:URI, each
# instance represents just one of those pairs.
#
# Both the class and its instances respond to
# from_basic_service_endpoint, so either can be handed to
# make_filter as a (trivial) endpoint transformer.
class BasicServiceEndpoint
  attr_reader :type_uris, :yadis_url, :uri, :service_element

  class << self
    # Identity transform from a basic endpoint to itself. It exists
    # so that BasicServiceEndpoint can be used as a filter.
    #
    # Subclasses should re-implement this method to build their own
    # endpoint type (or return nil to reject the endpoint).
    def from_basic_service_endpoint(endpoint)
      endpoint
    end
  end

  # yadis_url::       the URL the Yadis document was discovered for
  # type_uris::       the type URIs of the service
  # uri::             the service URI represented by this endpoint
  # service_element:: the service element this endpoint came from
  def initialize(yadis_url, type_uris, uri, service_element)
    @yadis_url = yadis_url
    @service_element = service_element
    @uri = uri
    @type_uris = type_uris
  end

  # Return the type URIs of this endpoint that also appear in the
  # given list. Useful for endpoint classes that need to check for
  # the presence of several versions of a single protocol.
  def match_types(type_uris)
    @type_uris & type_uris
  end

  # Instance-level counterpart of the class method of the same name;
  # it simply delegates to the (possibly overridden) class method so
  # that instances can be used wherever the class can.
  def from_basic_service_endpoint(endpoint)
    self.class.from_basic_service_endpoint(endpoint)
  end
end
|
53
|
+
|
54
|
+
# Turns a list of basic endpoint transformers into a top-level
# filter. This is mostly useful for the implementation of
# make_filter and should only be needed for special cases or
# internal use by this library.
#
# For every (Type list, URI) expansion of a service element a
# BasicServiceEndpoint is created and handed to the transformers in
# order until one of them returns a non-nil value.
class TransformFilterMaker
  attr_reader :filter_procs

  # filter_procs:: the endpoint transformers (anything responding to
  #                call) to apply to each basic endpoint. They are
  #                tried in turn; the first non-nil result wins.
  def initialize(filter_procs)
    @filter_procs = filter_procs
  end

  # Returns an array with the endpoint objects the transformers
  # produced for the given service element. Expansions for which
  # every transformer returned nil are left out.
  def get_service_endpoints(yadis_url, service_element)
    accepted = []

    # One basic endpoint per xrd:Type / xrd:URI expansion of the
    # service element.
    Yadis::expand_service(service_element).each do |type_uris, uri, _|
      basic = BasicServiceEndpoint.new(
        yadis_url, type_uris, uri, service_element)

      transformed = apply_filters(basic)
      accepted << transformed unless transformed.nil?
    end

    accepted
  end

  # Run the transformers over the endpoint and return the first
  # result that is not nil (later transformers are not called).
  # Returns nil when no transformer accepts the endpoint.
  def apply_filters(endpoint)
    @filter_procs.each do |filter_proc|
      outcome = filter_proc.call(endpoint)
      return outcome unless outcome.nil?
    end

    nil
  end
end
|
115
|
+
|
116
|
+
# A filter made of several sub-filters whose results are collected
# into one list.
class CompoundFilter
  attr_reader :subfilters

  # subfilters:: the filters (objects responding to
  #              get_service_endpoints) to combine
  def initialize(subfilters)
    @subfilters = subfilters
  end

  # Ask every sub-filter, in order, for its endpoint objects and
  # return the concatenation of all the resulting arrays.
  def get_service_endpoints(yadis_url, service_element)
    @subfilters.inject([]) do |collected, subfilter|
      collected + subfilter.get_service_endpoints(yadis_url, service_element)
    end
  end
end
|
135
|
+
|
136
|
+
# Pre-built TypeError describing the kinds of objects that can be
# converted into a filter.
@@filter_type_error = TypeError.new('Expected a filter, an endpoint, a callable or a list of any of these.')
|
140
|
+
|
141
|
+
# Convert a filter-convertible thing into a filter.
#
# parts:: a filter, an endpoint, a callable, or an Array of any of
#         these. nil selects the default, BasicServiceEndpoint.
#
# The argument is normalized to an Array and handed to
# mk_compound_filter, whose result is returned.
def self.make_filter(parts)
  parts = [BasicServiceEndpoint] if parts.nil?
  parts = [parts] unless parts.is_a?(Array)
  mk_compound_filter(parts)
end
|
157
|
+
|
158
|
+
# Create a filter out of a list of filter-like things.
#
# Used by make_filter.
#
# parts:: an iterable whose elements are each one of
#         * a full filter (responds to get_service_endpoints),
#         * an endpoint class or object (responds to
#           from_basic_service_endpoint),
#         * a callable (responds to call), or
#         * an Array of any of these, which is converted recursively.
#
# All endpoint transformers and callables found directly in parts are
# bundled into a single TransformFilterMaker, which is placed after
# the full filters. When exactly one filter results it is returned
# as is; otherwise a CompoundFilter wrapping all of them is returned.
#
# Raises TypeError when parts is not iterable or when an element is
# not filter-like.
def self.mk_compound_filter(parts)
  if !parts.respond_to?('each')
    raise TypeError, "#{parts.inspect} is not iterable"
  end

  # Separate into a list of callables and a list of filter objects
  transformers = []
  filters = []
  parts.each { |subfilter|
    if subfilter.is_a?(Array)
      # A nested list becomes a filter of its own
      filters << mk_compound_filter(subfilter)
    elsif subfilter.respond_to?('get_service_endpoints')
      # It's a full filter
      filters << subfilter
    elsif subfilter.respond_to?('from_basic_service_endpoint')
      # It's an endpoint object, so put its endpoint conversion
      # method into the list of endpoint transformers
      transformers << subfilter.method('from_basic_service_endpoint')
    elsif subfilter.respond_to?('call')
      # It's a proc, so add it to the list of endpoint transformers
      transformers << subfilter
    else
      # Raise a fresh exception each time: an exception object that
      # has already been raised keeps its first backtrace, so a
      # shared instance would point at the wrong call site.
      raise TypeError,
            'Expected a filter, an endpoint, a callable or a list of any of these.'
    end
  }

  if transformers.length > 0
    filters << TransformFilterMaker.new(transformers)
  end

  if filters.length == 1
    return filters[0]
  else
    return CompoundFilter.new(filters)
  end
end
|
204
|
+
end
|
205
|
+
end
|
@@ -0,0 +1,305 @@
|
|
1
|
+
# = HTMLTokenizer
|
2
|
+
#
|
3
|
+
# Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
|
4
|
+
# Copyright:: Copyright (c) 2004 Ben Giddings
|
5
|
+
# License:: Distributes under the same terms as Ruby
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# This is a partial port of the functionality behind Perl's HTML::TokeParser.
|
9
|
+
# Given a page, it progressively returns the tokens found in that page.
|
10
|
+
#
|
11
|
+
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
|
12
|
+
|
13
|
+
#
|
14
|
+
# A class to tokenize HTML.
|
15
|
+
#
|
16
|
+
# Example:
|
17
|
+
#
|
18
|
+
# page = "<HTML>
|
19
|
+
# <HEAD>
|
20
|
+
# <TITLE>This is the title</TITLE>
|
21
|
+
# </HEAD>
|
22
|
+
# <!-- Here comes the <a href=\"missing.link\">blah</a>
|
23
|
+
# comment body
|
24
|
+
# -->
|
25
|
+
# <BODY>
|
26
|
+
# <H1>This is the header</H1>
|
27
|
+
# <P>
|
28
|
+
# This is the paragraph, it contains
|
29
|
+
# <a href=\"link.html\">links</a>,
|
30
|
+
# <img src=\"blah.gif\" optional alt='images
|
31
|
+
# are
|
32
|
+
# really cool'>. Ok, here is some more text and
|
33
|
+
# <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
|
34
|
+
# </P>
|
35
|
+
# </body>
|
36
|
+
# </HTML>
|
37
|
+
# "
|
38
|
+
# toke = HTMLTokenizer.new(page)
|
39
|
+
#
|
40
|
+
# assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
|
41
|
+
# assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
|
42
|
+
# assert("links" == toke.getTrimmedText)
|
43
|
+
# assert(toke.getTag("IMG", "A").attr_hash['optional'])
|
44
|
+
# assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
|
45
|
+
#
|
46
|
+
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The complete source text being tokenized.
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, HTMLTag or HTMLText, or nil when the whole
  # page has been consumed. Raises HTMLTokenizerError when a comment
  # or tag is opened but never closed.
  def peekNextToken
    return nil if @cur_pos == @page.length

    if @page[@cur_pos, 1] != '<'
      # Next token is text: everything up to (not including) the
      # next '<', or the rest of the page if there is none.
      text_end = @page.index('<', @cur_pos)
      text_end = text_end.nil? ? -1 : (text_end - 1)
      return HTMLText.new(@page[@cur_pos .. text_end])
    end

    # Next token is a tag of some kind
    if '!--' == @page[(@cur_pos + 1), 3]
      # Token is a comment
      tag_end = @page.index('-->', (@cur_pos + 1))
      if tag_end.nil?
        raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
      end
      # +2 so that the closing "-->" is part of the token
      HTMLComment.new(@page[@cur_pos .. (tag_end + 2)])
    else
      # Token is a html tag
      tag_end = @page.index('>', (@cur_pos + 1))
      if tag_end.nil?
        raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
      end
      HTMLTag.new(@page[@cur_pos .. tag_end])
    end
  end

  # Get the next token and advance past it. Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or nil at the end of the page.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Tag names are matched case-insensitively. With no arguments the
  # next tag of any kind is returned. Returns nil when the end of
  # the page is reached without finding a match.
  def getTag(*sought_tags)
    sought_tags = sought_tags.collect { |elm| elm.downcase }

    while (tag = getNextToken)
      if tag.kind_of?(HTMLTag) &&
         (sought_tags.empty? || sought_tags.include?(tag.tag_name))
        break
      end
    end
    tag
  end

  # Get all the text between the current position and the next tag
  # (if until_tag is not given) or a specific later tag.
  #
  # Without until_tag, returns "" when the next token is a tag or
  # when the end of the page has been reached.
  #
  # With until_tag (matched case-insensitively, like getTag), the
  # text of every token before that tag is collected, each piece
  # followed by a single space; the matching tag itself is not
  # consumed.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Next token is a tag, not text
      return "" if @page[@cur_pos, 1] == '<'

      # Next token is text, unless the page is exhausted
      token = getNextToken
      return token.nil? ? "" : token.text
    end

    # HTMLTag#tag_name is always lower case
    until_tag = until_tag.downcase
    ret_str = String.new

    while (tag = peekNextToken)
      break if tag.kind_of?(HTMLTag) && tag.tag_name == until_tag

      piece = tag.text
      ret_str << (piece + " ") if "" != piece

      # Consume the token we just peeked at without re-parsing it
      @cur_pos += tag.raw.length
    end

    ret_str
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, " ")
  end
end
|
168
|
+
|
169
|
+
# Raised when the page (or the text handed to a token class) cannot
# be tokenized, e.g. for an unterminated tag or comment.
#
# Derives from StandardError so that an ordinary +rescue+ clause
# handles it; direct subclasses of Exception are reserved for
# conditions such as interrupts and exits that should normally not
# be rescued.
class HTMLTokenizerError < StandardError
end
|
171
|
+
|
172
|
+
# Common base class of the three kinds of HTML tokens (text,
# comment, tag). A token is little more than the raw source text
# it was created from.
class HTMLToken
  attr_accessor :raw

  # Remember the raw source text that makes up this token
  def initialize(text)
    @raw = text
  end

  # A token's string form is exactly its raw source
  def to_s
    self.raw
  end

  # Text representation of the token; the generic token has none
  def text
    ""
  end

  # The text representation with surrounding whitespace removed and
  # every inner run of whitespace collapsed to one space
  def trimmed_text
    stripped = text.strip
    stripped.gsub(/\s+/m, " ")
  end

  # Two tokens (or a token and any object) are equal when the raw
  # source equals the other object's string form
  def ==(other)
    other_source = other.to_s
    raw == other_source
  end
end
|
200
|
+
|
201
|
+
# Token for a run of character data found outside of any tag
class HTMLText < HTMLToken
  # For a text token the text representation is the raw source itself
  def text
    self.raw
  end
end
|
207
|
+
|
208
|
+
# Token for an HTML comment ("<!-- ... -->")
class HTMLComment < HTMLToken
  # The comment body without the delimiters and without the
  # whitespace directly inside them
  attr_accessor :contents

  # text:: the raw comment, including the "<!--" and "-->" delimiters
  #
  # Raises HTMLTokenizerError when text does not look like a comment.
  def initialize(text)
    super(text)

    matched = text.match(/^<!--\s*(.*?)\s*-->$/m)
    if matched.nil?
      raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = matched[1]
  end
end
|
221
|
+
|
222
|
+
# Class representing an HTML tag: anything of the form "<...>" that is
# not a comment, i.e. start tags and end tags.
class HTMLTag < HTMLToken
  # end_tag::  true for an end tag ("</a>"), false otherwise
  # tag_name:: the lower-cased tag name; end tags carry a leading "/"
  attr_reader :end_tag, :tag_name

  # text:: the raw tag, including the enclosing angle brackets
  #
  # Raises HTMLTokenizerError when text is not enclosed in "<" and ">"
  # or when no tag name can be found in it.
  def initialize(text)
    super(text)
    if text[0, 1] != '<' || text[-1, 1] != '>'
      raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
    end

    # The attribute hash is filled lazily by attr_hash
    @attr_hash = Hash.new
    @hashed = false

    tag_name = text.scan(/[\w:-]+/)[0]
    if tag_name.nil?
      raise HTMLTokenizerError, "Error, tag is nil: #{text}"
    end

    if text[1, 1] == '/'
      # It's an end tag
      @end_tag = true
      @tag_name = '/' + tag_name.downcase
    else
      @end_tag = false
      @tag_name = tag_name.downcase
    end
  end

  # Retrieve a hash of all the tag's attributes.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker.
  #
  # Keys are the lower-cased attribute names. Values are the
  # attribute values with enclosing quotes removed; an attribute
  # given without a value maps to its own name as written in the
  # tag. End tags always yield an empty hash.
  def attr_hash
    # Lazy initialize == don't build the hash until it's needed
    return @attr_hash if @hashed

    if !@end_tag
      # Get everything between the tag name and the closing ">"
      attr_arr = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
      if attr_arr.kind_of?(Array)
        # Attributes found, parse them into [name, value, ...] triples
        attrs = attr_arr[0]
        attr_arr = attrs.scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
        attr_arr.each { |item|
          name = item[0]
          value = item[1]
          val = if value.nil?
                  # No value given: use the attribute name itself
                  name
                elsif ['"', "'"].include?(value[0, 1])
                  # Remove the enclosing quotes
                  value[1 .. -2]
                else
                  value
                end
          @attr_hash[name.downcase] = val
        }
      end
    end
    @hashed = true

    @attr_hash
  end

  # Get the 'alt' text for an img or applet tag, if it exists, or an
  # empty string otherwise
  def text
    if !end_tag
      case tag_name
      when 'img', 'applet'
        alt = attr_hash['alt']
        return alt if !alt.nil?
      end
    end
    return ''
  end
end
|
305
|
+
|