siefca-httpage 0.0.8 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/httpage/httpage.rb +11 -12
- data/lib/httpage.rb +10 -1
- metadata +13 -4
- data/lib/httpage/bufferaffects.rb +0 -224
data/lib/httpage/httpage.rb
CHANGED
@@ -1,17 +1,9 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# encoding: utf-8
|
3
3
|
|
4
|
-
require 'iconv'
|
5
|
-
require 'htmlentities'
|
6
|
-
require 'net/http'
|
7
|
-
require 'net/https'
|
8
|
-
require 'timeout'
|
9
|
-
require 'zlib'
|
10
|
-
require 'uri'
|
11
|
-
|
12
4
|
class HTTPage
|
13
5
|
|
14
|
-
|
6
|
+
include BufferAffects
|
15
7
|
|
16
8
|
buffers_reset_method :reset_buffers
|
17
9
|
attr_affects_buffers :url, :encoding
|
@@ -174,7 +166,7 @@ class HTTPage
|
|
174
166
|
gsub(/<.*?>/m, ''))
|
175
167
|
end
|
176
168
|
|
177
|
-
# Transliterates text to ASCII and removes unknown characters
|
169
|
+
# Transliterates text to ASCII and removes unknown characters.
|
178
170
|
|
179
171
|
def clean_text(text=nil, enc=nil)
|
180
172
|
text ||= self.body
|
@@ -185,9 +177,8 @@ class HTTPage
|
|
185
177
|
page = Iconv.iconv('ASCII//TRANSLIT//IGNORE', 'UTF-8', page).join.downcase
|
186
178
|
page.tr!(".!?", ' ')
|
187
179
|
page.gsub!(/[^\x00-\x7F]+/, '')
|
188
|
-
page.gsub!(/[^a-z0-9\-_
|
180
|
+
page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
|
189
181
|
page.gsub!('_amp__',"'")
|
190
|
-
page.gsub!(%r{[.*?]}mi, '')
|
191
182
|
page.squeeze!(" \n")
|
192
183
|
page.gsub!(/^\s?\n\s?$/m, '')
|
193
184
|
page.gsub!(/\n\s/,"\n")
|
@@ -201,5 +192,13 @@ class HTTPage
|
|
201
192
|
|
202
193
|
def clean; clean_text end
|
203
194
|
|
195
|
+
# Transliterates text to ASCII and removes unknown characters leaving just words.
|
196
|
+
|
197
|
+
def clean_words(text=nil, enc=nil)
|
198
|
+
clean_text(text, enc).
|
199
|
+
gsub(%r{[.*?]}mi, ' ').
|
200
|
+
gsub(/[^a-z0-9]+/im, ' ')
|
201
|
+
end
|
202
|
+
|
204
203
|
end
|
205
204
|
|
data/lib/httpage.rb
CHANGED
@@ -6,5 +6,14 @@
|
|
6
6
|
# Copyright:: Copyright (c) 2009 Paweł Wilk
|
7
7
|
# License:: LGPL
|
8
8
|
|
9
|
-
require 'httpage/bufferaffects'
|
9
|
+
require 'iconv'
|
10
|
+
require 'htmlentities'
|
11
|
+
require 'net/http'
|
12
|
+
require 'net/https'
|
13
|
+
require 'timeout'
|
14
|
+
require 'zlib'
|
15
|
+
require 'uri'
|
16
|
+
|
17
|
+
require 'bufferaffects'
|
10
18
|
require 'httpage/httpage'
|
19
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: siefca-httpage
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- "Pawe\xC5\x82 Wilk"
|
@@ -22,7 +22,17 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0"
|
24
24
|
version:
|
25
|
-
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bufferaffects
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
description: HTTPage is simple HTTP(S) reader with ability to transliterate body
|
26
36
|
email: pw@gnu.org
|
27
37
|
executables: []
|
28
38
|
|
@@ -33,7 +43,6 @@ extra_rdoc_files: []
|
|
33
43
|
files:
|
34
44
|
- lib/httpage.rb
|
35
45
|
- lib/httpage/httpage.rb
|
36
|
-
- lib/httpage/bufferaffects.rb
|
37
46
|
has_rdoc: true
|
38
47
|
homepage: http://randomseed.pl/httpage
|
39
48
|
post_install_message:
|
@@ -59,6 +68,6 @@ rubyforge_project:
|
|
59
68
|
rubygems_version: 1.2.0
|
60
69
|
signing_key:
|
61
70
|
specification_version: 2
|
62
|
-
summary:
|
71
|
+
summary: HTTPage is simple HTTP(S) reader with ability to transliterate body
|
63
72
|
test_files: []
|
64
73
|
|
@@ -1,224 +0,0 @@
|
|
1
|
-
# = httpage/bufferaffects
|
2
|
-
#
|
3
|
-
# Author:: Paweł Wilk (mailto:pw@gnu.org)
|
4
|
-
# Copyright:: Copyright (c) 2009 Paweł Wilk
|
5
|
-
# License:: LGPL
|
6
|
-
#
|
7
|
-
|
8
|
-
# This module is intended to be used as an extension
|
9
|
-
# (class level mixin) for classes using some buffers
|
10
|
-
# that may be altered by calling certain methods.
|
11
|
-
#
|
12
|
-
# It automates resetting of buffers by installing
|
13
|
-
# wrappers for invasive methods you choose. It rewrites
|
14
|
-
# selected methods by adding to them code that calls
|
15
|
-
# buffer(s) flushing method created by you.
|
16
|
-
#
|
17
|
-
# === Markers
|
18
|
-
#
|
19
|
-
# To select which methods are invasive for your buffer(s)
|
20
|
-
# you should use markers which in usage are similar to
|
21
|
-
# accessors, e.g:
|
22
|
-
#
|
23
|
-
# attr_affects_buffers :domain
|
24
|
-
#
|
25
|
-
# Markers may be placed anywhere in the class. Wrapping
|
26
|
-
# routine will wait for methods to be defined if you
|
27
|
-
# mark them too early in your code.
|
28
|
-
#
|
29
|
-
# ==== Marking methods
|
30
|
-
#
|
31
|
-
# To mark methods which should trigger reset operation
|
32
|
-
# when called use method_affects_buffers which takes
|
33
|
-
# comma-separated list of symbols describing names
|
34
|
-
# of these methods.
|
35
|
-
#
|
36
|
-
# ==== Marking attributes (setters)
|
37
|
-
#
|
38
|
-
# The marker attr_affects_buffers is similar but it takes
|
39
|
-
# instance members not methods as arguments. It just installs
|
40
|
-
# hooks for corresponding setters.
|
41
|
-
#
|
42
|
-
# === Buffers flushing method
|
43
|
-
#
|
44
|
-
# Default instance method called to reset buffers should be
|
45
|
-
# defined under name +reset_buffers+
|
46
|
-
# You may also want to set up your own name by calling
|
47
|
-
# buffers_reset_method class method. The name of your
|
48
|
-
# buffers flushing method is passed to subclasses but
|
49
|
-
# each subclass may redefine it.
|
50
|
-
#
|
51
|
-
# Be aware that a sub-subclass
|
52
|
-
# will still need redefinition since it's kind of one-level
|
53
|
-
# inheritance.
|
54
|
-
#
|
55
|
-
# Buffers flushing method may take none or exactly one argument.
|
56
|
-
# If your method will take an argument then a name of calling
|
57
|
-
# method will be passed to it as symbol.
|
58
|
-
#
|
59
|
-
# === Inherited classes
|
60
|
-
#
|
61
|
-
# This module tries to be inheritance-safe but you will have to
|
62
|
-
# mark methods and members in subclasses if you are going
|
63
|
-
# to redefine them. The smooth way is of course to use +super+
|
64
|
-
# in overloaded methods so it will also do the job.
|
65
|
-
#
|
66
|
-
# === Caution
|
67
|
-
#
|
68
|
-
# This code uses Module#method_added hook. If you're going
|
69
|
-
# to redefine that method in class using this module remember
|
70
|
-
# to wrap and call original version or add one line to your
|
71
|
-
# definition: +ba_check_method(name)+
|
72
|
-
#
|
73
|
-
# === Example
|
74
|
-
#
|
75
|
-
# class Main
|
76
|
-
#
|
77
|
-
# extend BufferAffects
|
78
|
-
#
|
79
|
-
# buffers_reset_method :reset_path_buffer
|
80
|
-
# attr_affects_buffers :subpart
|
81
|
-
# attr_accessor :subpart, :otherpart
|
82
|
-
#
|
83
|
-
# def reset_path_buffer(name)
|
84
|
-
# @path = nil
|
85
|
-
# p "reset called for #{name}"
|
86
|
-
# end
|
87
|
-
#
|
88
|
-
# def path
|
89
|
-
# @path ||= @subpart.to_s + @otherpart.to_s
|
90
|
-
# end
|
91
|
-
#
|
92
|
-
# end
|
93
|
-
#
|
94
|
-
# obj = Main.new
|
95
|
-
# obj.subpart = 'test'
|
96
|
-
# p obj.path
|
97
|
-
# obj.subpart = '1234'
|
98
|
-
# p obj.path
|
99
|
-
|
100
|
-
module BufferAffects
|
101
|
-
|
102
|
-
@@__ba_wrapped__ = {}
|
103
|
-
@@__ba_reset_m__ = nil
|
104
|
-
|
105
|
-
# This method sets name of method that will be used to reset buffers.
|
106
|
-
|
107
|
-
def buffers_reset_method(name)
|
108
|
-
name = name.to_s.strip
|
109
|
-
raise ArgumentError.new('method name cannot be empty') if name.empty?
|
110
|
-
@__ba_reset_method__ = name.to_sym
|
111
|
-
@@__ba_reset_m__ ||= @__ba_reset_method__
|
112
|
-
end
|
113
|
-
private :buffers_reset_method
|
114
|
-
|
115
|
-
# This method sets the marker for hook to be installed.
|
116
|
-
# It ignores methods for which wrapper already exists.
|
117
|
-
|
118
|
-
def method_affects_buffers(*names)
|
119
|
-
@__ba_methods__ ||= {}
|
120
|
-
names.uniq!
|
121
|
-
names.collect! { |name| name.to_sym }
|
122
|
-
names.delete_if { |name| @__ba_methods__.has_key?(name) }
|
123
|
-
ba_methods_wrap(*names)
|
124
|
-
end
|
125
|
-
private :method_affects_buffers
|
126
|
-
|
127
|
-
# This method searches for setter methods for given
|
128
|
-
# member names and tries to wrap them into buffers
|
129
|
-
# resetting hooks using method_affects_buffers
|
130
|
-
|
131
|
-
def attr_affects_buffers(*names)
|
132
|
-
names.collect! { |name| :"#{name}=" }
|
133
|
-
method_affects_buffers(*names)
|
134
|
-
end
|
135
|
-
private :attr_affects_buffers
|
136
|
-
|
137
|
-
# This method installs hook for given methods or puts their names
|
138
|
-
# on the queue if methods haven't been defined yet. The queue is
|
139
|
-
# tested each time ba_check_method is called.
|
140
|
-
#
|
141
|
-
# Each processed method can be in one of 2 states:
|
142
|
-
# * false - method is not processed now
|
143
|
-
# * true - method is now processed
|
144
|
-
#
|
145
|
-
# After successful wrapping method name (key) and object ID (value) pairs
|
146
|
-
# are added to two containers: @@__ba_wrapped__ and @__ba_methods__
|
147
|
-
|
148
|
-
def ba_methods_wrap(*names)
|
149
|
-
names.delete_if { |name| @__ba_methods__[name] == true } # don't handle methods being processed
|
150
|
-
kmethods = public_instance_methods +
|
151
|
-
private_instance_methods +
|
152
|
-
protected_instance_methods
|
153
|
-
install_now = names.select { |name| kmethods.include?(name) } # select methods for immediate wrapping
|
154
|
-
install_now.delete_if do |name| # but don't wrap already wrapped
|
155
|
-
@@__ba_wrapped__.has_key?(name) && # - wrapped by our class or other class
|
156
|
-
!@__ba_methods__.has_key?(name) # - not wrapped by our class
|
157
|
-
end
|
158
|
-
|
159
|
-
install_later = names - install_now # collect undefined and wrapped methods
|
160
|
-
install_later.each { |name| @__ba_methods__[name] = false } # and add them to the waiting queue
|
161
|
-
|
162
|
-
install_now.each { |name| @__ba_methods__[name] = true } # mark methods as currently processed
|
163
|
-
installed = ba_install_hook(*install_now) # and install hooks for them
|
164
|
-
install_now.each { |name| @__ba_methods__[name] = false } # mark methods as not processed again
|
165
|
-
installed.each_pair do |name,id| # and note the object IDs of wrapped methods
|
166
|
-
@@__ba_wrapped__[name] = id # shared container
|
167
|
-
@__ba_methods__[name] = id # this class's container
|
168
|
-
end
|
169
|
-
end
|
170
|
-
private :ba_methods_wrap
|
171
|
-
|
172
|
-
# This method checks whether the method whose name is given
|
173
|
-
# is now available and should be installed.
|
174
|
-
|
175
|
-
def ba_check_method(name)
|
176
|
-
name = name.to_sym
|
177
|
-
@__ba_methods__ ||= {}
|
178
|
-
if @__ba_methods__.has_key?(name)
|
179
|
-
ba_methods_wrap(name)
|
180
|
-
end
|
181
|
-
end
|
182
|
-
private :ba_check_method
|
183
|
-
|
184
|
-
# This method installs hook which alters given methods by wrapping
|
185
|
-
# them into method that invokes buffers resetting routine. It will
|
186
|
-
# not install hook for methods beginning with __ba, which signals
|
187
|
-
# that they are wrappers for other methods.
|
188
|
-
|
189
|
-
def ba_install_hook(*names)
|
190
|
-
@__ba_reset_method__ ||= @@__ba_reset_m__
|
191
|
-
@__ba_reset_method__ ||= 'reset_buffers'
|
192
|
-
installed = {}
|
193
|
-
names.uniq.each do |name|
|
194
|
-
new_method = name.to_s
|
195
|
-
next if new_method[0..3] == '__ba'
|
196
|
-
orig_id = instance_method(name.to_sym).object_id
|
197
|
-
orig_method = '__ba' + orig_id.to_s + '__'
|
198
|
-
reset_method = @__ba_reset_method__.to_s
|
199
|
-
module_eval %{
|
200
|
-
alias_method :#{orig_method}, :#{new_method}
|
201
|
-
private :#{orig_method}
|
202
|
-
def #{new_method}(*args, &block)
|
203
|
-
if method(:#{reset_method}).arity == 1
|
204
|
-
#{reset_method}(:#{new_method})
|
205
|
-
else
|
206
|
-
#{reset_method}
|
207
|
-
end
|
208
|
-
return #{orig_method}(*args, &block)
|
209
|
-
end
|
210
|
-
}
|
211
|
-
installed[name] = orig_id
|
212
|
-
end
|
213
|
-
return installed
|
214
|
-
end
|
215
|
-
private :ba_install_hook
|
216
|
-
|
217
|
-
# Hook that intercepts added methods.
|
218
|
-
|
219
|
-
def method_added(name)
|
220
|
-
ba_check_method(name)
|
221
|
-
end
|
222
|
-
|
223
|
-
end
|
224
|
-
|