siefca-httpage 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,224 @@
1
+ # = httpage/bufferaffects
2
+ #
3
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
4
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
5
+ # License:: LGPL
6
+ #
7
+
8
+ # This module is intended to be used as extension
9
+ # (class level mixin) for classes using some buffers
10
+ # that may be altered by calling certain methods.
11
+ #
12
+ # It automates resetting of buffers by installing
13
+ # wrappers for invasive methods you choose. It rewrites
14
+ # selected methods by adding to them code that calls
15
+ # buffer(s) flushing method created by you.
16
+ #
17
+ # === Markers
18
+ #
19
+ # To select which methods are invasive for your buffer(s)
20
+ # you should use markers which in usage are similar to
21
+ # accessors, e.g.:
22
+ #
23
+ # attr_affects_buffers :domain
24
+ #
25
+ # Markers may be placed anywhere in the class. Wrapping
26
+ # routine will wait for methods to be defined if you
27
+ # mark them too early in your code.
28
+ #
29
+ # ==== Marking methods
30
+ #
31
+ # To mark methods which should trigger reset operation
32
+ # when called use method_affects_buffers which takes
33
+ # comma-separated list of symbols describing names
34
+ # of these methods.
35
+ #
36
+ # ==== Marking attributes (setters)
37
+ #
38
+ # The marker attr_affects_buffers is similar but it takes
39
+ # instance members not methods as arguments. It just installs
40
+ # hooks for corresponding setters.
41
+ #
42
+ # === Buffers flushing method
43
+ #
44
+ # Default instance method called to reset buffers should be
45
+ # defined under name +reset_buffers+
46
+ # You may also want to set up your own name by calling
47
+ # buffers_reset_method class method. The name of your
48
+ # buffers flushing method is passed to subclasses but
49
+ # each subclass may redefine it.
50
+ #
51
+ # Be aware that sub-subclass
52
+ # will still need redefinition since it's kind of one-level
53
+ # inheritance.
54
+ #
55
+ # Buffers flushing method may take none or exactly one argument.
56
+ # If your method will take an argument then a name of calling
57
+ # method will be passed to it as symbol.
58
+ #
59
+ # === Inherited classes
60
+ #
61
+ # This module tries to be inheritance-safe but you will have to
62
+ # mark methods and members in subclasses if you are going
63
+ # to redefine them. The smooth way is of course to call +super+
64
+ # in the overridden methods, which will also do the job.
65
+ #
66
+ # === Caution
67
+ #
68
+ # This code uses Module#method_added hook. If you're going
69
+ # to redefine that method in class using this module remember
70
+ # to wrap and call original version or add one line to your
71
+ # definition: +ba_check_method(name)+
72
+ #
73
+ # === Example
74
+ #
75
+ # class Main
76
+ #
77
+ # extend BufferAffects
78
+ #
79
+ # buffers_reset_method :reset_path_buffer
80
+ # attr_affects_buffers :subpart
81
+ # attr_accessor :subpart, :otherpart
82
+ #
83
+ # def reset_path_buffer(name)
84
+ # @path = nil
85
+ # p "reset called for #{name}"
86
+ # end
87
+ #
88
+ # def path
89
+ # @path ||= @subpart.to_s + @otherpart.to_s
90
+ # end
91
+ #
92
+ # end
93
+ #
94
+ # obj = Main.new
95
+ # obj.subpart = 'test'
96
+ # p obj.path
97
+ # obj.subpart = '1234'
98
+ # p obj.path
99
+
100
module BufferAffects

  # Sets the name of the instance method that will be called to reset
  # buffers. The name is stored in the calling class and is looked up
  # through the ancestors chain, so subclasses (at any depth) inherit it
  # unless they set their own. Classes that are not related to each other
  # never share this setting.
  #
  # Raises ArgumentError if the given name is empty.
  def buffers_reset_method(name)
    name = name.to_s.strip
    raise ArgumentError, 'method name cannot be empty' if name.empty?
    @__ba_reset_method__ = name.to_sym
  end
  private :buffers_reset_method

  # Sets the marker for hooks to be installed on the given methods.
  # Names that were already marked in this class are ignored.
  def method_affects_buffers(*names)
    @__ba_methods__ ||= {}
    fresh = names.map { |name| name.to_sym }.uniq
    fresh.reject! { |name| @__ba_methods__.key?(name) }
    ba_methods_wrap(*fresh)
  end
  private :method_affects_buffers

  # Marks setter methods of the given members (e.g. :domain marks
  # +domain=+) using method_affects_buffers.
  def attr_affects_buffers(*names)
    setters = names.map { |name| :"#{name}=" }
    method_affects_buffers(*setters)
  end
  private :attr_affects_buffers

  # Installs hooks for the given methods or puts their names on the
  # waiting queue if the methods cannot be wrapped yet. The queue is
  # tested each time ba_check_method is called.
  #
  # Each name kept in @__ba_methods__ is in one of three states:
  # * false            - queued (not defined yet, or inherited from an
  #                      ancestor that has already wrapped it)
  # * true             - hook installation is in progress
  # * an UnboundMethod - the wrapper installed by this class
  #
  # Returns a hash of the names wrapped by this call and their wrappers.
  def ba_methods_wrap(*names)
    @__ba_methods__ ||= {}
    names = names.map { |name| name.to_sym }.uniq
    names.reject! { |name| @__ba_methods__[name] == true }  # being processed right now
    names.reject! { |name| ba_current_wrapper?(name) }      # our wrapper is still in place

    install_now = names.select do |name|
      ba_method_known?(name) && !ba_wrapped_by_ancestor?(name)
    end
    install_later = names - install_now
    install_later.each { |name| @__ba_methods__[name] = false }

    # The "in progress" flag stops the method_added hook, fired by the
    # wrapper definition itself, from wrapping the wrapper again.
    install_now.each { |name| @__ba_methods__[name] = true }
    installed = begin
      ba_install_hook(*install_now)
    ensure
      # never leave a name stuck in the "in progress" state
      install_now.each { |name| @__ba_methods__[name] = false }
    end
    installed.each_pair { |name, wrapper| @__ba_methods__[name] = wrapper }
    installed
  end
  private :ba_methods_wrap

  # Checks whether the method of the given name has been marked and,
  # if so, tries to wrap it. Call it from your own method_added hook
  # if you override that hook without calling +super+.
  def ba_check_method(name)
    name = name.to_sym
    @__ba_methods__ ||= {}
    ba_methods_wrap(name) if @__ba_methods__.key?(name)
  end
  private :ba_check_method

  # Replaces each given method with a wrapper that invokes the buffers
  # resetting method and then the original implementation. The original
  # is kept under a private alias whose name begins with +__ba+; methods
  # whose names begin with +__ba+ are never wrapped. The wrapper keeps
  # the visibility (public, protected or private) of the original.
  #
  # The resetting method is called with the wrapped method's name
  # (as a symbol) when its arity is exactly 1, and without arguments
  # otherwise.
  #
  # Returns a hash: method name => UnboundMethod of the new wrapper.
  def ba_install_hook(*names)
    reset_name = ba_reset_method_name
    installed = {}
    names.map { |name| name.to_sym }.uniq.each do |name|
      next if name.to_s[0, 4] == '__ba'
      visibility = ba_visibility(name)

      # A per-class sequence number combined with the class's own object
      # id gives an alias name that cannot collide with aliases made
      # earlier in this class or in any ancestor or descendant.
      @__ba_seq__ = (@__ba_seq__ || 0) + 1
      backup = :"__ba#{object_id}_#{@__ba_seq__}__"

      alias_method backup, name
      private backup
      define_method(name) do |*args, &block|
        if method(reset_name).arity == 1
          send(reset_name, name)
        else
          send(reset_name)
        end
        send(backup, *args, &block)
      end
      send(visibility, name)
      installed[name] = instance_method(name)
    end
    installed
  end
  private :ba_install_hook

  # Returns the name of the buffers resetting method: the first one
  # configured in this class or its ancestors, or :reset_buffers.
  def ba_reset_method_name
    ancestors.each do |mod|
      next unless mod.instance_variable_defined?(:@__ba_reset_method__)
      configured = mod.instance_variable_get(:@__ba_reset_method__)
      return configured.to_sym if configured
    end
    :reset_buffers
  end
  private :ba_reset_method_name

  # Tells whether an instance method of the given name is visible in
  # this class, whatever its visibility.
  def ba_method_known?(name)
    method_defined?(name) || private_method_defined?(name)
  end
  private :ba_method_known?

  # Returns :private, :protected or :public for the given instance method.
  def ba_visibility(name)
    return :private   if private_method_defined?(name)
    return :protected if protected_method_defined?(name)
    :public
  end
  private :ba_visibility

  # Tells whether the method currently visible under the given name is
  # the very wrapper this class has installed (so wrapping it again
  # would only make the buffers reset twice).
  def ba_current_wrapper?(name)
    recorded = @__ba_methods__[name]
    recorded.is_a?(UnboundMethod) &&
      ba_method_known?(name) &&
      recorded == instance_method(name)
  end
  private :ba_current_wrapper?

  # Tells whether the method visible under the given name is inherited
  # from another class or module that has recorded a wrapper for it.
  def ba_wrapped_by_ancestor?(name)
    owner = instance_method(name).owner
    return false if owner.equal?(self)
    return false unless owner.instance_variable_defined?(:@__ba_methods__)
    registry = owner.instance_variable_get(:@__ba_methods__)
    registry.respond_to?(:[]) && registry[name].is_a?(UnboundMethod)
  end
  private :ba_wrapped_by_ancestor?

  # Hook that intercepts added methods. It chains to +super+ so other
  # method_added hooks keep working.
  def method_added(name)
    ba_check_method(name)
    super
  end

end
224
+
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'iconv'
5
+ require 'htmlentities'
6
+ require 'net/http'
7
+ require 'net/https'
8
+ require 'timeout'
9
+ require 'zlib'
10
+ require 'uri'
11
+
12
# Reads a document over HTTP(S) and offers helpers for detecting its
# encoding, stripping HTML markup and transliterating the text to ASCII.
#
# The fetched response and the detected encoding are cached; assigning
# +url+ or +encoding+ resets that cache (see BufferAffects).
class HTTPage

  extend BufferAffects

  buffers_reset_method :reset_buffers
  attr_affects_buffers :url, :encoding

  attr_accessor :redir_retry, :conn_retry, :timeout, :url
  attr_writer   :encoding

  # url         - address of the document (String or URI)
  # redir_retry - how many redirects may be followed
  # conn_retry  - how many retriable (3xx) responses are tolerated in total
  # timeout     - timeout in seconds for a single request
  def initialize(url,redir_retry=5,conn_retry=8,timeout=40)
    @encoding     = nil
    @response     = nil
    @http_req     = nil
    @redir_retry  = redir_retry
    @conn_retry   = conn_retry
    @timeout      = timeout
    self.url      = url
  end

  # Resets encoding and response buffers.
  def reset_buffers
    @encoding = nil
    @response = nil
  end

  # Sets new url. Accepts a String or a URI object; a URI is duplicated
  # so the caller's object is never modified. An empty path becomes '/'.
  def url=(url)
    # The decision is made on the argument (not on the previous @url),
    # so assigning a new address to an existing object really changes it.
    @url = url.kind_of?(URI) ? url.dup : URI.parse(url.to_s)
    @url.path = '/' if @url.path.nil? || @url.path.empty?
    @http_req = Net::HTTP::Get.new(request_path(@url))
  end

  # Returns page encoding (given explicitly or detected from the page).
  def encoding
    @encoding ||= get_page_encoding
  end

  # Obtains encoding from document body or server response header.
  # Returns default_encoding when the page cannot be fetched or when
  # no known charset is declared.
  def get_page_encoding(default_encoding='ascii')
    page = self.response
    return default_encoding if page.nil?

    # try meta-tag header
    declared = page.body.to_s.scan(/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i)
    enc = extract_encoding(declared.flatten.first)

    # try server header
    enc = extract_encoding(page['content-type']) if enc.nil?

    # try default
    enc.nil? ? default_encoding : enc
  end
  private :get_page_encoding

  # Extracts encoding from content-type string
  # (e.g. 'text/html; charset=utf-8'). Returns the canonical name of
  # the encoding or nil if there is no charset or it is not known.
  def extract_encoding(enc_string)
    return nil if enc_string.nil? || enc_string.empty?

    params = {}
    enc_string.chomp.downcase.split(';').each do |segment|
      key, value = segment.split('=', 2)
      next if key.nil? || value.nil?
      # values may be padded with spaces or quoted: charset="utf-8"
      params[key.strip] = value.strip.delete(%q{"'})
    end

    charset = params['charset']
    return nil if charset.nil? || charset.empty?
    charset = 'utf-8' if charset == 'utf8'

    begin
      Encoding.find(charset).name
    rescue ArgumentError        # unknown encoding name
      nil
    end
  end
  private :extract_encoding

  # Builds the request target (path plus optional query string) for
  # the given URI; an empty path is sent as '/'.
  def request_path(uri)
    path = uri.path.to_s.empty? ? '/' : uri.path
    uri.query.to_s.empty? ? path : "#{path}?#{uri.query}"
  end
  private :request_path

  # Fetches document using HTTP and returns response object. It also sets
  # encoding unless one has been assigned since the last reset.
  #
  # Redirects are followed (relative Location headers are resolved
  # against the current address). Returns nil when the scheme is not
  # http/https, on any connection error or timeout, when the retry
  # counters are exhausted, or when the final response is not 200 OK.
  def response
    return @response unless @response.nil?

    url         = @url
    http_req    = @http_req
    redir_retry = @redir_retry
    conn_retry  = @conn_retry

    loop do
      scheme = url.scheme.to_s.downcase
      return nil unless scheme == 'http' || scheme == 'https'

      fetched = nil
      begin
        Timeout.timeout(@timeout) do
          http = Net::HTTP.new(url.host, url.port)
          if scheme == 'https'
            http.use_ssl = true
            # NOTE(review): certificate verification is disabled, as in
            # the original code; the peer's identity is NOT checked.
            http.verify_mode = OpenSSL::SSL::VERIFY_NONE
          end
          fetched = http.start { |conn| conn.request(http_req) }
        end
        fetched.value             # raises unless the response is 2xx
      rescue Net::HTTPRetriableError
        # 3xx response: every such response uses up one conn_retry,
        # and following a Location additionally uses one redir_retry
        conn_retry -= 1
        location = fetched.nil? ? nil : fetched['location']
        unless location.nil? || location.empty?
          begin
            url = url.merge(location)       # handles relative locations
          rescue URI::Error
            return nil
          end
          http_req = Net::HTTP::Get.new(request_path(url))
          redir_retry -= 1
        end
        return nil if redir_retry < 0 || conn_retry < 0
        next
      rescue
        return nil
      end

      # Only 200 OK counts as success. Other 2xx responses give up here;
      # retrying them would loop forever as no counter is decremented.
      return nil unless fetched.kind_of?(Net::HTTPOK)

      @response = fetched
      # keep an encoding assigned by the user, detect it otherwise
      @encoding ||= get_page_encoding
      return fetched
    end
  end

  # Returns document body or nil if the document could not be fetched.
  def body
    r = self.response
    r.respond_to?(:body) ? r.body : nil
  end

  # Strips HTML tags from document and decodes HTML entities.
  # Uses the fetched body when no text is given; returns nil when
  # there is no text at all.
  def strip_html(text=nil)
    text ||= self.body
    return nil if text.nil?
    coder = HTMLEntities.new
    coder.decode(text.tr("\t", ' ').
                      tr("\r", '').
                      sub(%r{<body.*?>(.*?)</body>}mi, '\1').
                      gsub(%r{<script.*?>(.*?)</script>}mi, ' ').
                      gsub(%r{<style.*?>(.*?)</style>}mi, ' ').
                      gsub(%r{<!--.*?-->}mi, ' ').
                      gsub(/<br\s*\/?>|<p>/mi, "\n").
                      gsub(/<.*?>/m, ''))
  end

  # Transliterates text to ASCII and removes unknown characters.
  # Uses the fetched body and the page encoding when not given;
  # returns nil when there is no text at all.
  def clean_text(text=nil, enc=nil)
    text ||= self.body
    return nil if text.nil?
    enc  ||= self.encoding
    page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
    page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', strip_html(page)).join.downcase
    page.tr!(".!?", ' ')
    page.gsub!(/[^\x00-\x7F]+/, '')
    page.gsub!(/[^a-z0-9\-_\+\s\n\.\!\?]+/im, '')
    # NOTE(review): this pattern is a character class (., * or ?), not
    # "text in square brackets"; after the two steps above it has
    # nothing left to match. Kept unchanged - confirm the intent.
    page.gsub!(%r{[.*?]}mi, '')
    page.squeeze!(" \n")
    page.gsub!(/^\s?\n\s?$/m, '')
    page.gsub!(/\n\s/,"\n")
    page.gsub!(/\s\n/,"\n")
    page.gsub!(/^\s+/,'')
    page.squeeze!("\n ")
    page
  end

  # Shortcut for clean_text called on the fetched document.
  def clean; clean_text end

end
200
+
data/lib/httpage.rb ADDED
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ #
3
+ # HTTP loading and transliteration
4
+ #
5
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
6
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
7
+ # License:: LGPL
8
+
9
+ require 'httpage/bufferaffects'
10
+ require 'httpage/httpage'
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: siefca-httpage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - "Pawe\xC5\x82 Wilk"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-22 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: httpage is simple HTTP(S) reader with ability to transliterate body
17
+ email: pw@gnu.org
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/httpage.rb
26
+ - lib/httpage/httpage.rb
27
+ - lib/httpage/bufferaffects.rb
28
+ has_rdoc: true
29
+ homepage: http://randomseed.pl/httpage
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project:
50
+ rubygems_version: 1.2.0
51
+ signing_key:
52
+ specification_version: 2
53
+ summary: httpage is simple HTTP(S) reader with ability to transliterate body
54
+ test_files: []
55
+