siefca-httpage 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/httpage/bufferaffects.rb +224 -0
- data/lib/httpage/httpage.rb +200 -0
- data/lib/httpage.rb +10 -0
- metadata +55 -0
@@ -0,0 +1,224 @@
|
|
1
|
+
# = httpage/bufferaffects
#
# Author::    Paweł Wilk (mailto:pw@gnu.org)
# Copyright:: Copyright (c) 2009 Paweł Wilk
# License::   LGPL
#
# This module is intended to be used as an extension
# (class-level mixin) for classes that keep some buffers
# which may be invalidated by calling certain methods.
#
# It automates resetting of buffers by installing
# wrappers for the invasive methods you choose. It rewrites
# the selected methods so that they first call the
# buffer(s) flushing method created by you and then run
# the original code.
#
# === Markers
#
# To select which methods are invasive for your buffer(s)
# you should use markers, which in usage are similar to
# accessors, e.g.:
#
#   attr_affects_buffers :domain
#
# Markers may be placed anywhere in the class. The wrapping
# routine will wait for methods to be defined if you
# mark them too early in your code.
#
# ==== Marking methods
#
# To mark methods which should trigger the reset operation
# when called use method_affects_buffers, which takes
# a comma-separated list of symbols naming these methods.
#
# ==== Marking attributes (setters)
#
# The marker attr_affects_buffers is similar but it takes
# instance member names, not method names, as arguments.
# It just installs hooks for the corresponding setters.
#
# === Buffers flushing method
#
# The default instance method called to reset buffers should be
# defined under the name +reset_buffers+.
# You may also set up your own name by calling the
# buffers_reset_method class method. The name of your
# buffers flushing method is inherited by subclasses
# (at any depth) and each subclass may redefine it.
# Classes that are not related by inheritance never
# share this setting.
#
# The buffers flushing method may take none or exactly one argument.
# If your method takes an argument then the name of the calling
# method will be passed to it as a symbol.
#
# === Inherited classes
#
# This module tries to be inheritance-safe but you will have to
# mark methods and members in subclasses if you are going
# to redefine them. The smooth way is of course to use +super+
# in overridden methods so it will also do the job.
#
# === Caution
#
# This code uses the Module#method_added hook. If you're going
# to redefine that hook in a class using this module remember
# to call +super+ or add one line to your
# definition: <tt>ba_check_method(name)</tt>
#
# === Example
#
#   class Main
#
#     extend BufferAffects
#
#     buffers_reset_method :reset_path_buffer
#     attr_affects_buffers :subpart
#     attr_accessor        :subpart, :otherpart
#
#     def reset_path_buffer(name)
#       @path = nil
#       p "reset called for #{name}"
#     end
#
#     def path
#       @path ||= @subpart.to_s + @otherpart.to_s
#     end
#
#   end
#
#   obj = Main.new
#   obj.subpart = 'test'
#   p obj.path
#   obj.subpart = '1234'
#   p obj.path

module BufferAffects

  # Name of the buffers flushing method used when neither the class
  # nor any of its superclasses has chosen one.
  DEFAULT_RESET_METHOD = :reset_buffers

  # This method sets the name of the method that will be used to
  # reset buffers. The name is stored per class; subclasses pick it
  # up through the superclass chain when their hooks are installed.
  #
  # Raises ArgumentError when the given name is empty.
  # Returns the name as a symbol.

  def buffers_reset_method(name)
    name = name.to_s.strip
    raise ArgumentError, 'method name cannot be empty' if name.empty?
    @__ba_reset_method__ = name.to_sym
  end
  private :buffers_reset_method

  # This method sets the marker for a hook to be installed.
  # It ignores methods that this class has already marked.

  def method_affects_buffers(*names)
    @__ba_methods__ ||= {}
    # convert first, then deduplicate, so 'foo' and :foo count as one name
    pending = names.map { |name| name.to_sym }.uniq
    pending.reject! { |name| @__ba_methods__.key?(name) }
    ba_methods_wrap(*pending)
  end
  private :method_affects_buffers

  # This method takes member names, derives the names of their
  # setter methods and marks those setters using
  # method_affects_buffers.

  def attr_affects_buffers(*names)
    method_affects_buffers(*names.map { |name| :"#{name}=" })
  end
  private :attr_affects_buffers

  # This method installs hooks for the given methods or puts their
  # names on the waiting queue if the methods haven't been defined yet
  # (or are inherited from a class that has already wrapped them).
  # The queue is tested each time ba_check_method is called.
  #
  # Each tracked method name maps to one of these values in
  # @__ba_methods__ (a per-class container):
  # * false   - method is queued and not being processed now
  # * true    - method is being processed right now
  # * Integer - method has been wrapped (object ID noted at wrap time)
  #
  # Returns a hash of the method names wrapped by this call.

  def ba_methods_wrap(*names)
    @__ba_methods__ ||= {}

    # don't handle methods being processed (re-entry from method_added)
    names = names.reject { |name| @__ba_methods__[name] == true }

    # select methods for immediate wrapping
    install_now = names.select do |name|
      ba_method_defined?(name) && !ba_inherited_wrapper?(name)
    end

    # undefined and inherited-wrapped methods go to the waiting queue
    (names - install_now).each { |name| @__ba_methods__[name] = false }

    # mark methods as currently processed and install hooks for them
    install_now.each { |name| @__ba_methods__[name] = true }
    begin
      installed = ba_install_hook(*install_now)
    ensure
      # never leave a method stuck in the "being processed" state
      install_now.each { |name| @__ba_methods__[name] = false }
    end

    # note the object IDs of wrapped methods
    @__ba_methods__.update(installed)
    installed
  end
  private :ba_methods_wrap

  # This method checks whether the method whose name is given
  # has been marked and, if so, tries to wrap it now.

  def ba_check_method(name)
    name = name.to_sym
    @__ba_methods__ ||= {}
    ba_methods_wrap(name) if @__ba_methods__.key?(name)
  end
  private :ba_check_method

  # This method installs hooks which alter the given methods by
  # wrapping them into a method that invokes the buffers resetting
  # routine before running the original code. It will not install
  # a hook for methods beginning with __ba, which signals that they
  # are aliases created by this module. The visibility of a wrapped
  # method is kept as it was.
  #
  # Returns a hash of method name (key) and object ID (value) pairs.

  def ba_install_hook(*names)
    reset_method = ba_reset_method_name
    installed = {}
    names.uniq.each do |name|
      name = name.to_sym
      next if name.to_s[0, 4] == '__ba'
      visibility  = ba_method_visibility(name)
      orig_id     = instance_method(name).object_id
      # the method name is part of the alias, so two different
      # methods can never end up sharing one alias
      orig_method = :"__ba#{orig_id}_#{name}__"
      alias_method orig_method, name
      private orig_method
      define_method(name) do |*args, &block|
        if method(reset_method).arity == 1
          send(reset_method, name)
        else
          send(reset_method)
        end
        send(orig_method, *args, &block)
      end
      send(visibility, name)
      installed[name] = orig_id
    end
    installed
  end
  private :ba_install_hook

  # Returns the name of the buffers flushing method chosen by this
  # class or by the nearest superclass that has chosen one, or the
  # default name when none of them did.

  def ba_reset_method_name
    klass = self
    while klass
      if klass.instance_variable_defined?(:@__ba_reset_method__)
        chosen = klass.instance_variable_get(:@__ba_reset_method__)
        return chosen if chosen
      end
      klass = klass.respond_to?(:superclass) ? klass.superclass : nil
    end
    DEFAULT_RESET_METHOD
  end
  private :ba_reset_method_name

  # Tells whether an instance method of the given name exists,
  # whatever its visibility is.

  def ba_method_defined?(name)
    method_defined?(name) || private_method_defined?(name)
  end
  private :ba_method_defined?

  # Tells whether the currently visible method of the given name is
  # inherited from a class or module that has already wrapped it.

  def ba_inherited_wrapper?(name)
    owner = instance_method(name).owner
    return false if owner.equal?(self)
    return false unless owner.instance_variable_defined?(:@__ba_methods__)
    owner.instance_variable_get(:@__ba_methods__)[name].is_a?(Integer)
  end
  private :ba_inherited_wrapper?

  # Returns the visibility of the given instance method as
  # :private, :protected or :public.

  def ba_method_visibility(name)
    return :private   if private_method_defined?(name)
    return :protected if protected_method_defined?(name)
    :public
  end
  private :ba_method_visibility

  # Hook that intercepts added methods. It passes control on with
  # +super+ so other method_added hooks keep working.

  def method_added(name)
    ba_check_method(name)
    super
  end

end
|
224
|
+
|
@@ -0,0 +1,200 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'iconv'
|
5
|
+
require 'htmlentities'
|
6
|
+
require 'net/http'
|
7
|
+
require 'net/https'
|
8
|
+
require 'timeout'
|
9
|
+
require 'zlib'
|
10
|
+
require 'uri'
|
11
|
+
|
12
|
+
# Simple HTTP(S) page reader. It fetches a document, follows
# redirects, detects the document's encoding and can strip HTML
# markup and transliterate the text to plain ASCII.
#
# The fetched response and the detected encoding are buffered;
# assigning a new +url+ or +encoding+ flushes these buffers
# (see BufferAffects).
class HTTPage

  extend BufferAffects

  buffers_reset_method :reset_buffers
  attr_affects_buffers :url, :encoding

  attr_accessor :redir_retry, :conn_retry, :timeout
  attr_reader   :url
  attr_writer   :encoding

  # Creates a new page reader.
  #
  # url::         address of the document (String or URI)
  # redir_retry:: how many redirects may be followed
  # conn_retry::  how many times a retriable response may be retried
  # timeout::     time limit (in seconds) for a single request

  def initialize(url,redir_retry=5,conn_retry=8,timeout=40)
    @encoding     = nil
    @response     = nil
    @http_req     = nil
    @redir_retry  = redir_retry
    @conn_retry   = conn_retry
    @timeout      = timeout
    self.url      = url
  end

  # Resets encoding and response buffers.

  def reset_buffers
    @encoding = nil
    @response = nil
  end

  # Sets new url. Accepts a String or a URI object; a URI object is
  # duplicated so the caller's object is never modified.

  def url=(url)
    @url = url.kind_of?(URI) ? url.dup : URI.parse(url.to_s)
    @url.path = '/' if @url.path.nil? || @url.path.empty?
    @http_req = Net::HTTP::Get.new(request_path(@url))
  end

  # Returns page encoding.

  def encoding
    @encoding ||= get_page_encoding
  end

  # Obtains encoding from document body or server response header.
  # Returns +default_encoding+ when the document cannot be fetched
  # or declares no usable encoding.

  def get_page_encoding(default_encoding='ascii')
    page = self.response
    return default_encoding if page.nil?

    # try meta-tag header (body may be nil for body-less responses)
    header = page.body.to_s[/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    enc = extract_encoding(header)

    # try server header
    enc = extract_encoding(page['content-type']) if enc.nil?

    # try default
    enc.nil? ? default_encoding : enc
  end
  private :get_page_encoding

  # Extracts encoding from content-type string. Returns the
  # canonical name of the encoding or +nil+ if the string carries
  # no charset known to Ruby.

  def extract_encoding(enc_string)
    return nil if enc_string.nil? || enc_string.empty?

    params = {}
    enc_string.chomp.downcase.split(';').each do |segment|
      key, value = segment.split('=', 2)
      next if key.nil? || value.nil?
      # the value may be padded with spaces or quoted
      params[key.strip] = value.strip.delete(%q{"'}).strip
    end

    charset = params['charset']
    return nil if charset.nil? || charset.empty?
    charset = 'utf-8' if charset == 'utf8'

    begin
      Encoding.find(charset).name
    rescue ArgumentError
      nil # charset name not known to Ruby
    end
  end
  private :extract_encoding

  # Fetches document using HTTP and returns response object. It also
  # sets encoding unless one has been assigned explicitly. Returns
  # +nil+ if the document cannot be fetched: on connection errors,
  # timeouts, unsupported URL schemes, client or server errors, or
  # when the retry limits are exhausted.

  def response
    return @response unless @response.nil?

    url         = @url
    http_req    = @http_req
    redir_retry = @redir_retry
    conn_retry  = @conn_retry
    found       = nil

    until found
      reply = fetch_once(url, http_req)
      case reply
      when Net::HTTPSuccess
        found = reply
      when Net::HTTPRedirection
        conn_retry -= 1
        target = redirect_target(url, reply['location'])
        if target
          url         = target
          http_req    = Net::HTTP::Get.new(request_path(url))
          redir_retry -= 1
        end
        return nil if redir_retry < 0 || conn_retry < 0
      else
        return nil
      end
    end

    @response = found
    @encoding ||= get_page_encoding
    found
  end

  # Performs a single request. Returns the response object or +nil+
  # on any error, timeout or unsupported URL scheme.

  def fetch_once(url, http_req)
    Timeout.timeout(@timeout) do
      case url.scheme.to_s.downcase
      when 'http'
        Net::HTTP.start(url.host, url.port) { |http| http.request(http_req) }
      when 'https'
        https = Net::HTTP.new(url.host, url.port)
        https.use_ssl = true
        # NOTE(review): certificate verification is disabled, so the
        # server's identity is not checked. Kept for compatibility;
        # consider OpenSSL::SSL::VERIFY_PEER.
        https.verify_mode = OpenSSL::SSL::VERIFY_NONE
        https.start { |http| http.request(http_req) }
      end
    end
  rescue StandardError
    nil
  end
  private :fetch_once

  # Resolves the Location header of a redirect against the URL that
  # has been requested, so relative redirects work too. Returns a URI
  # or +nil+ when the location is missing or malformed.

  def redirect_target(base, location)
    return nil if location.nil? || location.strip.empty?
    base.merge(location.strip)
  rescue URI::Error
    nil
  end
  private :redirect_target

  # Returns the path to ask the server for, including the query
  # string when the URI provides one.

  def request_path(uri)
    uri.respond_to?(:request_uri) ? uri.request_uri : uri.path
  end
  private :request_path

  # Returns document body or +nil+ if the document cannot be fetched.

  def body
    r = self.response
    r.respond_to?(:body) ? r.body : nil
  end

  # Strips HTML tags from document. Works on the given text or,
  # when none is given, on the document body. Returns +nil+ when
  # there is no text to work on.

  def strip_html(text=nil)
    text ||= self.body
    return nil if text.nil?
    coder = HTMLEntities.new
    coder.decode(text.tr("\t", ' ').
                      delete("\r").
                      sub(%r{<body.*?>(.*?)</body>}mi, '\1').
                      gsub(%r{<script.*?>(.*?)</script>}mi, ' ').
                      gsub(%r{<style.*?>(.*?)</style>}mi, ' ').
                      gsub(%r{<!--.*?-->}mi, ' ').
                      gsub(/<br\s*\/?>|<p>/mi, "\n").
                      gsub(/<.*?>/m, ''))
  end

  # Transliterates text to ASCII and removes unknown characters.
  # Works on the given text (in encoding +enc+) or, when none is
  # given, on the document body. Returns +nil+ when there is no
  # text to work on.

  def clean_text(text=nil, enc=nil)
    text ||= self.body
    return nil if text.nil?
    enc  ||= self.encoding
    page = Iconv.iconv('UTF-8//IGNORE', enc, text).join
    page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', strip_html(page)).join.downcase
    page.tr!(".!?", ' ')
    page.gsub!(/[^\x00-\x7F]+/, '')
    page.gsub!(/[^a-z0-9\-_\+\s\n\.\!\?]+/im, '')
    page.squeeze!(" \n")
    page.gsub!(/^\s?\n\s?$/m, '')
    page.gsub!(/\n\s/,"\n")
    page.gsub!(/\s\n/,"\n")
    page.gsub!(/^\s+/,'')
    page.squeeze!("\n ")
    page
  end

  # Shortcut for clean_text called on the document body.

  def clean; clean_text end

end
|
200
|
+
|
data/lib/httpage.rb
ADDED
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: siefca-httpage
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- "Pawe\xC5\x82 Wilk"
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-04-22 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: httpage is simple HTTP(S) reader with ability to transliterate body
|
17
|
+
email: pw@gnu.org
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/httpage.rb
|
26
|
+
- lib/httpage/httpage.rb
|
27
|
+
- lib/httpage/bufferaffects.rb
|
28
|
+
has_rdoc: true
|
29
|
+
homepage: http://randomseed.pl/httpage
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
|
33
|
+
require_paths:
|
34
|
+
- lib
|
35
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: "0"
|
40
|
+
version:
|
41
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
42
|
+
requirements:
|
43
|
+
- - ">="
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: "0"
|
46
|
+
version:
|
47
|
+
requirements: []
|
48
|
+
|
49
|
+
rubyforge_project:
|
50
|
+
rubygems_version: 1.2.0
|
51
|
+
signing_key:
|
52
|
+
specification_version: 2
|
53
|
+
summary: httpage is simple HTTP(S) reader with ability to transliterate body
|
54
|
+
test_files: []
|
55
|
+
|