siefca-httpage 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,224 @@
1
+ # = httpage/bufferaffects
2
+ #
3
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
4
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
5
+ # License:: LGPL
6
+ #
7
+
8
+ # This module is intended to be used as extension
9
+ # (class level mixin) for classes using some buffers
10
+ # that may be altered by calling certain methods.
11
+ #
12
+ # It automates resetting of buffers by installing
13
+ # wrappers for invasive methods you choose. It rewrites
14
+ # selected methods by adding to them code that calls
15
+ # buffer(s) flushing method created by you.
16
+ #
17
+ # === Markers
18
+ #
19
+ # To select which methods are invasive for your buffer(s)
20
+ # you should use markers which in usage are similar to
21
+ # accessors, e.g.:
22
+ #
23
+ # attr_affects_buffers :domain
24
+ #
25
+ # Markers may be placed anywhere in the class. Wrapping
26
+ # routine will wait for methods to be defined if you
27
+ # mark them too early in your code.
28
+ #
29
+ # ==== Marking methods
30
+ #
31
+ # To mark methods which should trigger reset operation
32
+ # when called use method_affects_buffers which takes
33
+ # comma-separated list of symbols describing names
34
+ # of these methods.
35
+ #
36
+ # ==== Marking attributes (setters)
37
+ #
38
+ # The marker attr_affects_buffers is similar but it takes
39
+ # instance members not methods as arguments. It just installs
40
+ # hooks for corresponding setters.
41
+ #
42
+ # === Buffers flushing method
43
+ #
44
+ # Default instance method called to reset buffers should be
45
+ # defined under name +reset_buffers+
46
+ # You may also want to set up your own name by calling
47
+ # buffers_reset_method class method. The name of your
48
+ # buffers flushing method is passed to subclasses but
49
+ # each subclass may redefine it.
50
+ #
51
+ # Be aware that sub-subclass
52
+ # will still need redefinition since it's kind of one-level
53
+ # inheritance.
54
+ #
55
+ # Buffers flushing method may take none or exactly one argument.
56
+ # If your method will take an argument then a name of calling
57
+ # method will be passed to it as symbol.
58
+ #
59
+ # === Inherited classes
60
+ #
61
+ # This module tries to be inheritance-safe but you will have to
62
+ # mark methods and members in subclasses if you are going
63
+ # to redefine them. The smooth way is of course to use +super+
64
+ # in overloaded methods so it will also do the job.
65
+ #
66
+ # === Caution
67
+ #
68
+ # This code uses Module#method_added hook. If you're going
69
+ # to redefine that method in class using this module remember
70
+ # to wrap and call original version or add one line to your
71
+ # definition: +ba_check_method(name)+
72
+ #
73
+ # === Example
74
+ #
75
+ # class Main
76
+ #
77
+ # extend BufferAffects
78
+ #
79
+ # buffers_reset_method :reset_path_buffer
80
+ # attr_affects_buffers :subpart
81
+ # attr_accessor :subpart, :otherpart
82
+ #
83
+ # def reset_path_buffer(name)
84
+ # @path = nil
85
+ # p "reset called for #{name}"
86
+ # end
87
+ #
88
+ # def path
89
+ # @path ||= @subpart.to_s + @otherpart.to_s
90
+ # end
91
+ #
92
+ # end
93
+ #
94
+ # obj = Main.new
95
+ # obj.subpart = 'test'
96
+ # p obj.path
97
+ # obj.subpart = '1234'
98
+ # p obj.path
99
+
100
module BufferAffects

  # Registry shared by every class that extends this module.
  # Keys are [owner, method name] pairs and values are object IDs of the
  # wrapped originals. Keying by owner keeps unrelated classes that happen
  # to use the same method name from blocking each other.
  @@__ba_wrapped__ = {}

  # First reset method name ever registered with buffers_reset_method.
  # Classes that never set their own name fall back to it.
  @@__ba_reset_m__ = nil

  # Sets the name of the instance method used to reset buffers.
  #
  # name:: method name (String or Symbol); surrounding whitespace is ignored.
  #
  # Raises ArgumentError when the name is empty.

  def buffers_reset_method(name)
    name = name.to_s.strip
    raise ArgumentError, 'method name cannot be empty' if name.empty?
    @__ba_reset_method__ = name.to_sym
    @@__ba_reset_m__ ||= @__ba_reset_method__
  end
  private :buffers_reset_method

  # Marks the given methods as invasive for buffers. Names already known
  # to this class (wrapped or waiting on the queue) are ignored.
  #
  # names:: method names (Strings or Symbols).

  def method_affects_buffers(*names)
    @__ba_methods__ ||= {}
    # Convert first and deduplicate afterwards, so that 'name' and :name
    # are recognized as the same method.
    names = names.map { |name| name.to_sym }.uniq
    names.reject! { |name| @__ba_methods__.has_key?(name) }
    ba_methods_wrap(*names)
  end
  private :method_affects_buffers

  # Marks setters of the given members as invasive for buffers.
  # Each member name is turned into the name of its setter and passed
  # to method_affects_buffers.

  def attr_affects_buffers(*names)
    setters = names.map { |name| :"#{name}=" }
    method_affects_buffers(*setters)
  end
  private :attr_affects_buffers

  # Installs hooks for the given methods or puts their names on the
  # waiting queue if the methods cannot be wrapped yet. The queue is
  # examined each time ba_check_method is called.
  #
  # While a method is handled its entry in @__ba_methods__ is:
  # * true  - the method is being processed right now
  # * false - the method is not being processed
  #
  # After successful wrapping the object ID of the original method is
  # stored in @__ba_methods__ (under the method name) and in the shared
  # registry (under the [class, method name] pair).

  def ba_methods_wrap(*names)
    @__ba_methods__ ||= {}
    # Skip methods being processed; this stops the recursion caused by
    # method_added firing while the wrapper itself is being defined.
    names = names.reject { |name| @__ba_methods__[name] == true }

    install_now   = names.select { |name| ba_wrappable?(name) }
    install_later = names - install_now
    install_later.each { |name| @__ba_methods__[name] = false }

    install_now.each { |name| @__ba_methods__[name] = true }
    begin
      installed = ba_install_hook(*install_now)
    ensure
      # Never leave a method marked as "being processed", even when the
      # installation raised an exception.
      install_now.each { |name| @__ba_methods__[name] = false }
    end

    installed.each_pair do |name, id|
      @@__ba_wrapped__[[self, name]] = id   # shared container
      @__ba_methods__[name] = id            # this class's container
    end
  end
  private :ba_methods_wrap

  # Tells whether the given method can be wrapped right now: it has to be
  # defined, and it must not be a wrapper inherited from another class
  # (wrapping that again would reset buffers twice per call).

  def ba_wrappable?(name)
    return false unless method_defined?(name) || private_method_defined?(name)
    owner = instance_method(name).owner
    owner.equal?(self) || !@@__ba_wrapped__.has_key?([owner, name])
  end
  private :ba_wrappable?

  # Returns the visibility of the given instance method as a symbol:
  # :private, :protected or :public.

  def ba_visibility_of(name)
    return :private   if private_method_defined?(name)
    return :protected if protected_method_defined?(name)
    :public
  end
  private :ba_visibility_of

  # Checks whether the method of the given name is known to this class
  # (marked earlier) and, if so, tries to wrap it.

  def ba_check_method(name)
    name = name.to_sym
    @__ba_methods__ ||= {}
    ba_methods_wrap(name) if @__ba_methods__.has_key?(name)
  end
  private :ba_check_method

  # Installs hooks altering the given methods by wrapping them into
  # methods that invoke the buffers resetting routine first. Methods
  # whose names begin with __ba are skipped since that prefix is used
  # for aliases of the wrapped originals.
  #
  # Returns a Hash of method name => object ID of the original method.

  def ba_install_hook(*names)
    @__ba_reset_method__ ||= @@__ba_reset_m__
    @__ba_reset_method__ ||= 'reset_buffers'
    reset_method = @__ba_reset_method__.to_s
    installed = {}
    names.uniq.each do |name|
      new_method = name.to_s
      next if new_method[0..3] == '__ba'
      # Remember the visibility: the wrapper defined below would
      # otherwise replace a private or protected method with a public one.
      visibility  = ba_visibility_of(name)
      orig_id     = instance_method(name.to_sym).object_id
      orig_method = "__ba#{orig_id}__"
      module_eval %{
        alias_method :#{orig_method}, :#{new_method}
        private :#{orig_method}
        def #{new_method}(*args, &block)
          if method(:#{reset_method}).arity == 1
            #{reset_method}(:#{new_method})
          else
            #{reset_method}
          end
          return #{orig_method}(*args, &block)
        end
      }
      send(visibility, name.to_sym)
      installed[name] = orig_id
    end
    installed
  end
  private :ba_install_hook

  # Hook that intercepts added methods. It calls super afterwards so that
  # other method_added hooks in the ancestors chain still run.

  def method_added(name)
    ba_check_method(name)
    super
  end

end
224
+
@@ -0,0 +1,200 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require 'iconv'
5
+ require 'htmlentities'
6
+ require 'net/http'
7
+ require 'net/https'
8
+ require 'timeout'
9
+ require 'zlib'
10
+ require 'uri'
11
+
12
# Simple HTTP(S) page reader which is able to detect the encoding of
# a document, strip HTML markup and transliterate the text to ASCII.
# The fetched response and the detected encoding are buffered and the
# setters for url and encoding are marked to reset these buffers.

class HTTPage

  extend BufferAffects

  buffers_reset_method :reset_buffers
  attr_affects_buffers :url, :encoding

  attr_accessor :redir_retry, :conn_retry, :timeout
  attr_reader   :url
  attr_writer   :encoding

  # Creates a new page object.
  #
  # url::         address of a document (String or URI)
  # redir_retry:: number of redirections that may be followed
  # conn_retry::  number of times a retriable response may be retried
  # timeout::     time limit (in seconds) for a single request

  def initialize(url,redir_retry=5,conn_retry=8,timeout=40)
    @encoding     = nil
    @response     = nil
    @http_req     = nil
    @redir_retry  = redir_retry
    @conn_retry   = conn_retry
    @timeout      = timeout
    self.url      = url
  end

  # Resets encoding and response buffers.

  def reset_buffers
    @encoding = nil
    @response = nil
  end

  # Sets new url. Accepts a String or an URI object; an URI object is
  # duplicated so the caller's instance is never modified. An empty
  # path is replaced by '/'.

  def url=(url)
    # Test the argument (not the previously stored @url), otherwise
    # the address could never be changed after the first assignment.
    parsed = url.kind_of?(URI) ? url.dup : URI.parse(url)
    parsed.path = '/' if parsed.path.nil? || parsed.path.empty?
    @url = parsed
    @http_req = Net::HTTP::Get.new(request_target(parsed))
  end

  # Returns page encoding.

  def encoding
    @encoding ||= get_page_encoding
  end

  # Returns the request target for the given URI: path with the query
  # string when the URI provides it, bare path otherwise.

  def request_target(uri)
    uri.respond_to?(:request_uri) ? uri.request_uri : uri.path
  end
  private :request_target

  # Obtains encoding from document body or server response header.
  # Returns default_encoding when the document cannot be fetched or
  # when no usable encoding is declared.

  def get_page_encoding(default_encoding='ascii')
    page = self.response
    return default_encoding if page.nil?

    # try meta-tag header first, then the server header
    meta = page.body.to_s[/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    enc = extract_encoding(meta)
    enc = extract_encoding(page['content-type']) if enc.nil?

    # try default
    enc.nil? ? default_encoding : enc
  end
  private :get_page_encoding

  # Extracts encoding from content-type string. Returns the name of
  # a known encoding or nil when the string declares no charset or the
  # charset is unknown.

  def extract_encoding(enc_string)
    return nil if enc_string.nil? || enc_string.empty?
    charset = nil
    enc_string.chomp.downcase.split(';').each do |segment|
      key, value = segment.split('=')
      next if key.nil? || value.nil?
      charset = value if key.strip == 'charset'
    end
    return nil if charset.nil?

    # the value may be padded with spaces or wrapped in quotes
    charset = charset.strip.gsub(/\A['"]+|['"]+\z/, '').strip
    return nil if charset.empty?
    charset = 'utf-8' if charset == 'utf8'
    begin
      found = Encoding.find(charset)
      # Encoding.find may return nil for special names (e.g. 'internal')
      found.nil? ? nil : found.name
    rescue ArgumentError
      nil
    end
  end
  private :extract_encoding

  # Fetches document using HTTP and returns response object. It also
  # sets encoding unless it has been set already. Returns nil when the
  # document cannot be fetched, when the scheme is not supported, when
  # the server answers with something else than 200 OK or when retry
  # limits are exceeded.

  def response
    return @response unless @response.nil?
    url         = @url
    http_req    = @http_req
    redir_retry = @redir_retry
    conn_retry  = @conn_retry
    response    = nil

    loop do
      begin
        Timeout::timeout(@timeout) do
          case url.scheme.to_s.downcase
          when 'http'
            response = Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
          when 'https'
            https = Net::HTTP.new(url.host, url.port)
            https.use_ssl = true
            # NOTE(review): certificate verification is disabled here, so
            # the identity of the server is not checked - confirm that
            # this is acceptable.
            https.verify_mode = OpenSSL::SSL::VERIFY_NONE
            response = https.start { |http| http.request(http_req) }
          else
            return nil
          end
        end
        response.value
      rescue Net::HTTPRetriableError
        conn_retry -= 1
        location = response.respond_to?(:[]) ? response['location'] : nil
        unless location.nil? || location.empty?
          begin
            # merge handles both absolute and relative locations
            url = url.merge(location)
          rescue URI::Error
            return nil
          end
          http_req = Net::HTTP::Get.new(request_target(url))
          redir_retry -= 1
        end
      rescue
        return nil
      end
      break if response.kind_of?(Net::HTTPOK)
      # Successful response other than 200 OK: repeating the request
      # would not change anything, so give up instead of looping forever.
      return nil if response.kind_of?(Net::HTTPSuccess)
      return nil if redir_retry < 0 || conn_retry < 0
    end

    @response = response
    # keep an encoding that was set explicitly
    @encoding ||= get_page_encoding
    response
  end

  # Returns document body or nil if the document cannot be fetched.

  def body
    r = self.response
    r.respond_to?(:body) ? r.body : nil
  end

  # Strips HTML tags from document. Uses body of the page when no text
  # is given. Returns nil when there is no text to work on.

  def strip_html(text=nil)
    text ||= self.body
    return nil if text.nil?
    coder = HTMLEntities.new
    coder.decode(text.tr("\t", ' ').
                      tr("\r", '').
                      sub(%r{<body.*?>(.*?)</body>}mi, '\1').
                      gsub(%r{<script.*?>(.*?)</script>}mi, ' ').
                      gsub(%r{<style.*?>(.*?)</style>}mi, ' ').
                      gsub(%r{<!--.*?-->}mi, ' ').
                      gsub(/<br\s*\/?>|<p>/mi, "\n").
                      gsub(/<.*?>/m, ''))
  end

  # Transliterates text to ASCII and removes unknown characters.
  # Uses body and encoding of the page when they are not given.
  # Returns nil when there is no text to work on.

  def clean_text(text=nil, enc=nil)
    text ||= self.body
    return nil if text.nil?
    enc ||= self.encoding
    page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
    page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', strip_html(page)).join.downcase
    page.tr!(".!?", ' ')
    page.gsub!(/[^\x00-\x7F]+/, '')
    page.gsub!(/[^a-z0-9\-_\+\s\n\.\!\?]+/im, '')
    page.gsub!(%r{[.*?]}mi, '')
    page.squeeze!(" \n")
    page.gsub!(/^\s?\n\s?$/m, '')
    page.gsub!(/\n\s/,"\n")
    page.gsub!(/\s\n/,"\n")
    page.gsub!(/^\s+/,'')
    page.squeeze!("\n ")
    page
  end

  # Shortcut for clean_text called with default arguments.

  def clean; clean_text end

end
200
+
data/lib/httpage.rb ADDED
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
2
+ #
3
+ # HTTP loading and transliteration
4
+ #
5
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
6
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
7
+ # License:: LGPL
8
+
9
+ require 'httpage/bufferaffects'
10
+ require 'httpage/httpage'
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: siefca-httpage
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - "Pawe\xC5\x82 Wilk"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-22 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: httpage is simple HTTP(S) reader with ability to transliterate body
17
+ email: pw@gnu.org
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/httpage.rb
26
+ - lib/httpage/httpage.rb
27
+ - lib/httpage/bufferaffects.rb
28
+ has_rdoc: true
29
+ homepage: http://randomseed.pl/httpage
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project:
50
+ rubygems_version: 1.2.0
51
+ signing_key:
52
+ specification_version: 2
53
+ summary: httpage is simple HTTP(S) reader with ability to transliterate body
54
+ test_files: []
55
+