siefca-htsucker 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/lib/htsucker.rb +34 -0
  2. data/lib/htsucker/htsucker.rb +468 -0
  3. metadata +73 -0
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ #
3
+ # HTTP loading and transliteration
4
+ #
5
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
6
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
7
+ # License:: LGPL
8
+
9
+ require 'iconv'
10
+ require 'htmlentities'
11
+ require 'net/http'
12
+ require 'net/https'
13
+ require 'timeout'
14
+ require 'uri'
15
+
16
+ require 'bufferaffects'
17
+ require './htsucker/domains_to_languages'
18
+ require './htsucker/htsucker'
19
+
20
+ # testing:
21
+
22
# Manual smoke test. It is guarded so that it runs only when this file is
# executed directly; merely requiring the library must not fire HTTP requests
# or print anything.
if __FILE__ == $0
  sites = [
    'wykop.pl/wykopalisko',
    'poland.com',
    'hyperreal.info',
    'grono.net',
    'google.pl',
    'randomseed.pl',
    'heise-online.de'
  ]

  # For every site print the final URL, the detected language and the charset.
  sites.each do |site|
    pa = HTSucker.new(site)
    puts "#{pa.real_url}: #{pa.language} #{pa.charset}"
  end
end
@@ -0,0 +1,468 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ class HTSucker
5
+
6
+ include DomainsToLanguages
7
+ include BufferAffects
8
+
9
+ buffers_reset_method :reset_buffers
10
+ attr_affects_buffers :url
11
+
12
+ attr_reader :url
13
+
14
+ # DefaultOpts is the template used by the class method HTSucker.default_options
15
+ # when it sets up the class variable @@default_options, which instances in turn
16
+ # use to fill in any options that were not given when creating new objects.
17
+
18
+ DefaultOpts = { :redir_retry => 5,
19
+ :conn_retry => 8,
20
+ :total_retry => 2,
21
+ :read_timeout => 15,
22
+ :total_timeout => 30,
23
+ :allow_strange_ports => false,
24
+ :max_length => 524288 }.freeze
25
+
26
+ # Creates new instance of HTSucker. +url+ parameter should be valid URI object or string.
27
+ # You may want to override defaults by issuing hash containing options you want to be changed.
28
+
29
# Builds a new HTSucker for +url+ (a String or a URI object).
#
# +options+ may be any object responding to +keys+ (typically a Hash); its
# entries override the class-wide defaults returned by
# HTSucker.default_options. Every resulting option is stored in an instance
# variable of the same name.
#
# Raises ArgumentError when +options+ contains a key that is not present in
# the defaults.
def initialize(url, options=nil)
  settings = self.class.default_options.dup

  if options.respond_to?(:keys)
    unknown = (options.keys - settings.keys).join(', ')
    raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?
    settings.merge!(options)
  end

  settings.each_pair { |name, value| instance_variable_set("@#{name}", value) }

  reset_buffers
  @http_req = nil
  self.url = url
end
43
+
44
+ # Resets charset and response buffers.
45
+
46
# Forgets everything remembered from a previous fetch: the charset, the
# content type, the response object and the final URL are cleared, and the
# overflow counter is set back to zero.
def reset_buffers
  @overflow = 0
  @charset = @content_type = @response = @real_url = nil
end
53
+
54
+ # Sets new url.
55
+
56
# Sets the URL to fetch and prepares the HEAD request used to probe it.
#
# +url+ may be a String or a URI object. Input that parses to a plain
# URI::Generic (e.g. 'example.com/page') is treated as HTTP. An empty path
# becomes '/'. The URL is checked with validate_url, which raises for
# unsupported protocols or ports.
#
# A URI object passed by the caller is duplicated first, so the caller's
# object is neither modified nor frozen.
def url=(url)
  url = url.kind_of?(URI) ? url.dup : URI.parse(url)
  # Only a *plain* generic URI lacks a usable scheme. Every URI class inherits
  # from URI::Generic, so is_a?/kind_of? would also match URI::HTTP and turn
  # 'http://host' into 'http://http://host'.
  url = URI.parse("http://#{url}") if url.instance_of?(URI::Generic)
  url.path = '/' if url.path.nil? || url.path.empty?
  validate_url(url)
  @url = url
  @url.freeze
  # request_uri keeps the query string, which a bare path would drop.
  @http_req = Net::HTTP::Head.new(@url.request_uri)
end
65
+
66
+ # Returns top-level domain for URL.
67
+
68
# Returns the last dot-separated label of the requested host (the top-level
# domain) as a lower-case symbol, e.g. :pl.
def domain
  labels = url.host.split('.')
  labels.last.downcase.to_sym
end
71
+
72
+ # Returns top-level domain for real URL.
73
+
74
# Returns the top-level domain of the URL the content was actually obtained
# from, as a lower-case symbol (e.g. :pl).
#
# Returns nil when there is no real URL (nothing could be fetched) or when it
# has no host, instead of failing with NoMethodError on nil.
def real_domain
  final = real_url
  return nil if final.nil?
  tld = final.host.to_s.split('.').last
  tld.nil? ? nil : tld.downcase.to_sym
end
77
+
78
+ # Returns resource path.
79
# Returns the resource path of the requested URL.
def path; url.path end

# Returns the resource path of the URL the content was actually obtained from,
# or nil when nothing was fetched.
def real_path
  final = real_url
  final.nil? ? nil : final.path
end

# Returns the hostname of the requested URL.
def host; url.host end

# Returns the hostname of the URL the content was actually obtained from,
# or nil when nothing was fetched.
def real_host
  final = real_url
  final.nil? ? nil : final.host
end

# Returns the port of the requested URL.
def port; url.port end

# Returns the port of the URL the content was actually obtained from,
# or nil when nothing was fetched.
def real_port
  final = real_url
  final.nil? ? nil : final.port
end

# Returns the protocol of the requested URL as a symbol derived from the URI
# class name (e.g. :http or :https).
def protocol; url.class.name.split('::').last.downcase.to_sym end

# Returns the protocol of the URL the content was actually obtained from, or
# nil when nothing was fetched (previously this yielded :nilclass).
def real_protocol
  final = real_url
  final.nil? ? nil : final.class.name.split('::').last.downcase.to_sym
end
101
+
102
+ # Returns page charset.
103
+
104
# Returns the page charset. On first use (while no charset is cached) the
# content type and charset are both obtained from get_page_info and cached.
def charset
  if @charset.nil?
    info = get_page_info
    @content_type, @charset = info
  end
  @charset
end
108
+
109
# Alias for charset.
def content_charset; charset end

# Overrides the detected page charset with +x+.
#
# The value is stored directly in @charset. A bare `charset=(x)` inside a
# method only assigns a local variable, which made this setter a no-op.
def content_charset=(x)
  @charset = x
end
111
+
112
+ # Returns page content-type.
113
+
114
# Returns the page content type. On first use (while no content type is
# cached) the content type and charset are both obtained from get_page_info
# and cached.
def content_type
  if @content_type.nil?
    info = get_page_info
    @content_type, @charset = info
  end
  @content_type
end
118
+
119
+ # Returns major name of the content-type or nil if something went wrong.
120
+
121
# Returns the part of the content type before the first slash as a symbol
# (e.g. :text for text/html), or nil when that part is missing or empty.
def content_type_major
  major = content_type.to_s.split('/').first
  major.to_s.empty? ? nil : major.to_sym
end
128
+
129
+ # Returns minor name of the content-type or nil if something went wrong.
130
+
131
# Returns the part of the content type after the first slash as a symbol
# (e.g. :html for text/html), or nil when that part is missing or empty.
def content_type_minor
  minor = content_type.to_s.split('/')[1]
  minor.to_s.empty? ? nil : minor.to_sym
end
138
+
139
# Checks that +url+ is acceptable for fetching.
#
# The protocol is taken from the URI class name and must be HTTP or HTTPS.
# Unless the :allow_strange_ports option is set, the port must be 80 for HTTP
# and 443 for HTTPS.
#
# Raises HTSuckerBadURI for an empty URL, HTSuckerBadProtocol for any other
# protocol and HTSuckerBadPort for an unexpected port. Returns nil otherwise.
def validate_url(url)
  raise HTSuckerBadURI.new("malformed URI") if url.to_s.empty?

  u_protocol = url.class.name.split('::').last.upcase
  expected_port = { 'HTTP' => 80, 'HTTPS' => 443 }[u_protocol]
  raise HTSuckerBadProtocol.new("bad protocol: #{u_protocol}") if expected_port.nil?

  return nil if @allow_strange_ports
  raise HTSuckerBadPort.new("strange port number: #{url.port}") if url.port != expected_port
end
private :validate_url
153
+
154
+ # Translates top-level domain to spoken language code.
155
+
156
# Guesses the spoken-language code from the top-level domain of the real URL.
#
# A guess is made only when the first three characters of the page charset
# are one of iso, win, cp-, koi or utf, and only for two-character top-level
# domains; the code is then looked up in @@domain_to_language. Returns nil in
# every other case.
def domain_to_spoken
  prefix = self.content_charset.to_s[0..2].downcase.to_sym
  return nil unless [:iso, :win, :"cp-", :koi, :utf].include?(prefix)
  return nil unless real_domain.length == 2
  @@domain_to_language[self.real_domain]
end
private :domain_to_spoken
166
+
167
+ # Returns content-language or default content language.
168
+
169
# Returns the content language of the page, or +default_content_lanuage+ when
# it cannot be determined.
#
# Sources are tried in this order:
# 1. a <meta http-equiv="content-language"> tag in the body,
# 2. the lang / xml:lang attribute of the <html> tag, then of the <body> tag,
# 3. the Content-Language response header; when that header is missing or
#    starts with 'en', the guess from domain_to_spoken is preferred and the
#    header value is used only if that guess is empty.
# Steps 1 and 2 are performed only for a non-empty body whose major content
# type is :text. When there is no response at all the default is returned.
#
# The result is whatever the chosen source produced (extract_content_language
# output, domain_to_spoken output, the header value as a String, or the
# default as given).
def content_language(default_content_lanuage='en')
  return default_content_lanuage if response.nil?

  clang = nil
  page = body.to_s

  if !page.empty? && content_type_major == :text
    # try meta-tag header
    declared = page[/<meta http-equiv\s*=\s*['"]*content-language['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    clang = extract_content_language(declared)

    # try lang and xml:lang attributes of the html tag, then of the body tag.
    # [^>]*? keeps the search inside one tag, allows the attribute to be the
    # first one (<html lang="pl">) and lets the tag span several lines.
    if clang.to_s.empty?
      attribute_patterns = [
        /<x?html\b[^>]*?\slang\s*=\s*["']?([^"'\s>]+)/i,
        /<x?html\b[^>]*?\sxml:lang\s*=\s*["']?([^"'\s>]+)/i,
        /<body\b[^>]*?\slang\s*=\s*["']?([^"'\s>]+)/i,
        /<body\b[^>]*?\sxml:lang\s*=\s*["']?([^"'\s>]+)/i
      ]
      declared = nil
      attribute_patterns.each do |pattern|
        declared = page[pattern, 1]
        break unless declared.to_s.empty?
      end
      clang = extract_content_language(declared)
    end
  end

  # try server header and in case of 'en' or empty try to figure language
  # by looking at the top-level domain
  if clang.to_s.empty? && response.respond_to?(:header)
    clang = extract_content_language(response.header['content-language'])
    present = clang.to_s
    clang = domain_to_spoken if present.empty? || present[0..1] == 'en'
    clang = present if clang.to_s.empty? && !present.empty?
  end

  clang.to_s.empty? ? default_content_lanuage : clang
end
217
+
218
# Shortcut for content_language called with its default argument.
def language
  content_language
end

# Shortcut for content_language called with its default argument.
def lang
  content_language
end
220
+
221
+ # Obtains charset from document body or server response header.
222
+
223
# Determines the content type and the charset of the page.
#
# Returns a two-element array [content_type, charset]. When there is no
# response the two defaults are returned as given.
#
# The Content-Type response header is read first; only when its major type is
# 'text' and the body is not empty is the body searched for a
# <meta http-equiv="content-type"> tag. Whatever the meta tag does not supply
# (type, charset, or both) is then taken from the response header, and
# whatever is still missing falls back to the defaults.
def get_page_info(default_content_type='text/html', default_charset='ascii')
  return [default_content_type, default_charset] if response.nil?

  server_header = response.respond_to?(:header) ? response.header['content-type'] : nil
  server_major = extract_content_type(server_header).to_s.split('/').first

  enc = nil
  ctype = nil

  # try meta-tag header, but only for content the server declared as text
  page = body.to_s
  if !page.empty? && server_major == 'text'
    meta = page[/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    enc = extract_charset(meta)
    ctype = extract_content_type(meta)
  end

  # try server header for each piece the meta tag did not provide; a meta tag
  # may carry a type without a charset as well as a charset without a type
  ctype = extract_content_type(server_header) if ctype.to_s.empty?
  enc = extract_charset(server_header) if enc.to_s.empty?

  # try defaults
  enc = default_charset if enc.to_s.empty?
  ctype = default_content_type if ctype.to_s.empty?

  [ctype, enc]
end
private :get_page_info
257
+
258
+ # Extracts charset from content-type string.
259
+
260
# Extracts the charset from a content-type string such as
# 'text/html; charset=UTF-8'.
#
# Returns the canonical encoding name known to Ruby as a lower-case symbol
# (e.g. :"utf-8"), or nil when the string is empty, declares no charset, or
# names an encoding Ruby does not know. When charset is declared more than
# once the last declaration wins. Surrounding whitespace and quotes around
# the value are ignored, and 'utf8' is accepted as 'utf-8'.
def extract_charset(enc_string)
  return nil if enc_string.to_s.empty?

  declared = nil
  enc_string.to_s.chomp.downcase.squeeze(' ').split(';').each do |segment|
    key, value = segment.split('=')
    next if key.nil? || value.nil?
    # compare as strings: header content is untrusted and needs no symbols
    declared = value if key.strip == 'charset'
  end
  return nil if declared.nil?

  declared = declared.strip.delete(%q{"'}).strip
  declared = 'utf-8' if declared == 'utf8'
  return nil if declared.empty?

  begin
    Encoding.find(declared).name.downcase.to_sym
  rescue ArgumentError
    nil # unknown encoding name
  end
end
private :extract_charset
284
+
285
+ # Extracts content-type from content-type string.
286
+
287
# Extracts the media type from a content-type string: everything before the
# first semicolon, stripped and lower-cased, returned as a symbol
# (e.g. :"text/html"). Returns nil for an empty string or when nothing
# precedes the semicolon.
def extract_content_type(ctype_string)
  if ctype_string.to_s.empty?
    nil
  else
    head = ctype_string.chomp.squeeze(' ').split(';').first
    head.nil? ? head : head.strip.downcase.to_sym
  end
end
private :extract_content_type
294
+
295
+ # Extracts content-language from content-language string.
296
+
297
# Extracts the first language tag from a content-language string such as
# 'pl-PL, en;q=0.8': the text before the first semicolon and the first comma,
# stripped and lower-cased, returned as a symbol (e.g. :"pl-pl").
#
# Returns nil when the string is empty or contains no language tag (for
# example ';' or ','), instead of failing or returning an empty symbol.
def extract_content_language(ltype_string)
  return nil if ltype_string.to_s.empty?

  first_item = ltype_string.to_s.chomp.squeeze(' ').split(';').first.to_s.split(',').first
  tag = first_item.to_s.strip.downcase
  tag.empty? ? nil : tag.to_sym
end
private :extract_content_language
304
+
305
+
306
+ # Fetches document using HTTP and returns response object. It also sets charset.
307
+
308
# Fetches the document and returns the Net::HTTPResponse, which is cached
# until the buffers are reset. It also sets the real URL, the content type
# and the charset.
#
# The URL is first probed with HEAD requests, following redirects:
# * every 3xx answer uses up one of :conn_retry attempts; when it carries a
#   Location header (absolute or relative) the probe moves there and one of
#   :redir_retry redirections is used up as well,
# * any other failure (4xx/5xx status, network error, timeout) gives up.
# Once the probe answers 200 OK the document is downloaded with GET, reading
# at most :max_length bytes.
#
# Returns nil when the document cannot be obtained. Raises
# HTSuckerContentTooBig when the declared or the actual length exceeds
# :max_length. Errors raised by validate_url for a redirect target are
# propagated.
def response
  return @response unless @response.nil?

  target = @url
  probe_req = @http_req
  redirects_left = @redir_retry
  attempts_left = @conn_retry
  probe = nil
  @real_url = nil

  loop do
    begin
      probe = http_exchange(target, probe_req)
      return nil if probe.nil?  # unsupported scheme
      probe.value               # raises for every non-2xx status
      break
    rescue Net::HTTPRetriableError
      attempts_left -= 1
      location = probe['location'].to_s
      unless location.empty?
        # merge resolves relative locations against the current URL
        target = target.merge(location)
        validate_url(target)
        # request_uri keeps the query and is never empty, unlike path
        probe_req = Net::HTTP::Head.new(target.request_uri)
        redirects_left -= 1
      end
    rescue StandardError
      return nil
    end
    return nil if redirects_left < 0 || attempts_left < 0
  end

  # A 2xx status other than 200 will not change on retry; give up instead of
  # asking again forever.
  return nil unless probe.kind_of?(Net::HTTPOK)

  @content_length = probe['content-length'].to_s.to_i
  if @content_length > @max_length
    raise HTSuckerContentTooBig.new("content length (#{@content_length}) is greater than declared limit (#{@max_length})")
  end

  page = begin
    download_page(target)
  rescue StandardError
    nil
  end
  if page == :too_big
    raise HTSuckerContentTooBig.new("content is longer than declared limit (#{@max_length})")
  end
  return nil if page.nil?

  @real_url = target
  @response = page
  @content_type, @charset = get_page_info(nil, nil)

  page
end

# Sends +request+ to the host of +target+ and returns the response, or nil
# when the scheme is neither http nor https. The whole exchange is limited by
# the :total_timeout option and socket reads by :read_timeout. A block, when
# given, is passed on to Net::HTTP#request and receives the response before
# its body is read.
def http_exchange(target, request, &reader)
  secure = case target.scheme.to_s.downcase
           when 'http'  then false
           when 'https' then true
           else return nil
           end

  session = Net::HTTP.new(target.host, target.port)
  session.read_timeout = @read_timeout unless @read_timeout.nil?
  if secure
    session.use_ssl = true
    # NOTE(review): certificate verification is switched off, as it was
    # before; consider OpenSSL::SSL::VERIFY_PEER.
    session.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end

  Timeout.timeout(@total_timeout) do
    session.start { |http| http.request(request, &reader) }
  end
end
private :http_exchange

# Downloads +target+ with GET, collecting the body chunk by chunk. Returns
# the response with its body set to the collected data, nil when the answer
# is not 200 OK, or :too_big as soon as more than :max_length bytes arrive
# (the transfer is abandoned at that point).
def download_page(target)
  buffer = String.new
  page = nil

  fits = catch(:htsucker_too_big) do
    page = http_exchange(target, Net::HTTP::Get.new(target.request_uri)) do |res|
      res.read_body do |chunk|
        buffer << chunk
        throw :htsucker_too_big, false if buffer.bytesize > @max_length
      end
    end
    true
  end

  return :too_big unless fits
  return nil unless page.kind_of?(Net::HTTPOK)

  # after a streamed read the response holds no body text; store what was read
  page.body = buffer
  page
end
private :download_page
368
+
369
+ # Returns document body.
370
+
371
# Returns the body of the fetched response, or nil when the response object
# does not respond to +body+ (for example when it is nil).
def body
  fetched = self.response
  return nil unless fetched.respond_to?(:body)
  fetched.body
end
375
+
376
+ # Alias for body.
377
+
378
# Alias for body; any arguments are handed over to it unchanged.
def fetch(*args)
  body(*args)
end
379
+
380
+ # Returns URL used while obtaining content (e.g. after redirection).
381
+
382
# Returns the URL the content was actually obtained from (e.g. after
# redirection). The response is requested first; nil is returned when there
# is none.
def real_url
  self.response.nil? ? nil : @real_url
end
386
+
387
+ # Strips HTML tags from document.
388
+
389
+ def strip_html(text=nil)
390
+ text ||= self.body
391
+ @coder ||= HTMLEntities.new
392
+ r = text.tr("\t", ' ')
393
+ r.tr!("\r", '')
394
+ r.sub!(%r{<body.*?>(.*?)</body>}mi, '\1')
395
+ r.gsub!(%r{<script.*?>(.*?)</script>}mi, ' ')
396
+ r.gsub!(%r{<style.*?>(.*?)</style>}mi, ' ')
397
+ r.gsub!(%r{<!--.*?-->}mi, ' ')
398
+ r.gsub!(/<br\s*\/?>|<p>/mi, "\n")
399
+ r.gsub!(/<.*?>/m, '')
400
+ return coder.decode(r)
401
+ end
402
+
403
+ # Transliterates text to ASCII and removes unknown characters.
404
+
405
# Converts +text+ (the document body by default) from encoding +enc+ (the
# page charset by default) to lower-case ASCII text without HTML markup.
#
# The text is converted to UTF-8, stripped of HTML with strip_html,
# transliterated to ASCII and lower-cased; characters outside a fixed set are
# dropped and whitespace is tidied up.
#
# +enc+ may be a String or a Symbol (the charset is normally a symbol); a
# missing text (nil) is treated as an empty string.
def clean_text(text=nil, enc=nil)
  text ||= self.body
  enc  ||= self.charset
  @transliterator ||= Iconv.new('ASCII//TRANSLIT//IGNORE', 'UTF-8')
  # Iconv expects encoding names as strings
  page = Iconv.iconv('UTF-8//IGNORE', enc.to_s, text.to_s).join
  page = strip_html(page)
  # apostrophes and backticks travel through the steps below as a placeholder
  # and come back as plain apostrophes afterwards
  page.gsub!(/['`]/m, '_amp__')
  page = @transliterator.conv(page).downcase
  page.tr!(".!?", ' ')
  page.gsub!(/[^\x00-\x7F]+/, '')
  page.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
  page.gsub!('_amp__',"'")
  # whitespace clean-up
  page.squeeze!(" \n")
  page.gsub!(/^\s?\n\s?$/m, '')
  page.gsub!(/\n\s/,"\n")
  page.gsub!(/\s\n/,"\n")
  page.gsub!(/^\s+/,'')
  # drop apostrophes that wrap a phrase or stand alone
  page.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3')
  page.gsub!(/(^|\s)\'+(\s|$)/, '')
  page.squeeze!("\n ")
  return page
end
427
+
428
# Shortcut for clean_text called with its default arguments.
def clean
  clean_text
end
429
+
430
+ # Transliterates text to ASCII and removes unknown characters leaving just words.
431
+
432
# Runs clean_text on +text+ and +enc+ and reduces the result to words:
# bracketed fragments that contain a colon followed by slashes (such as
# [http://...]) and bracketed fragments made only of digits, whitespace and
# non-word characters become a space, every run of characters other than
# letters and digits becomes a single space, and repeated spaces are
# squeezed.
def clean_words(text=nil, enc=nil)
  cleaned = clean_text(text, enc)
  without_links = cleaned.gsub(/\[\s*?[^\:]+?\:\/+?.*?\]/mi, ' ')
  without_marks = without_links.gsub(/\[\s*?(\d|\s|[^\w])+\]/mi, ' ')
  without_marks.gsub(/[^a-z0-9]+/im, ' ').squeeze(' ')
end
440
+
441
+ # Transliterates text to ASCII, removes unknown characters and returns array of words.
442
+
443
# Returns the output of clean_words split on whitespace, as an array of
# words.
def words
  cleaned = self.clean_words
  cleaned.split(' ')
end
446
+
447
+ # Use this class method to set up default options used when creating new objects.
448
+ # For each option that you omit it will be taken from constant hash called DefaultOpts.
449
+ # Default options hash is stored in @@default_options. This method will return current
450
+ # default options when called without parameter.
451
+
452
# Reads or changes the default options used when creating new objects.
#
# Called without an argument it returns the current defaults as a frozen
# hash; they start as a copy of DefaultOpts. Called with +opts+ (any object
# responding to +keys+, typically a Hash) it merges them into the defaults
# and returns the new frozen hash. Options that are omitted keep their
# current values.
#
# Raises ArgumentError when +opts+ does not respond to +keys+ or contains a
# key that is not present in DefaultOpts.
def self.default_options(opts=nil)
  @@default_options ||= DefaultOpts.dup.freeze
  return @@default_options if opts.nil?

  raise ArgumentError.new("malformed options") unless opts.respond_to?(:keys)

  unknown = (opts.keys - DefaultOpts.keys).join(', ')
  raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?

  # A frozen hash cannot be thawed or merged in place, so a new frozen hash
  # replaces the old one.
  @@default_options = @@default_options.merge(opts).freeze
end
466
+
467
+ end
468
+
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: siefca-htsucker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - "Pawe\xC5\x82 Wilk"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-28 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: htmlentities
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: bufferaffects
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: HTSucker is simple HTTP(S) reader with ability to transliterate body
36
+ email: pw@gnu.org
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/htsucker.rb
45
+ - lib/htsucker/htsucker.rb
46
+ has_rdoc: true
47
+ homepage: http://randomseed.pl/htsucker
48
+ post_install_message:
49
+ rdoc_options: []
50
+
51
+ require_paths:
52
+ - lib
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ version:
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ requirements: []
66
+
67
+ rubyforge_project:
68
+ rubygems_version: 1.2.0
69
+ signing_key:
70
+ specification_version: 2
71
+ summary: HTSucker is simple HTTP(S) reader with ability to transliterate body
72
+ test_files: []
73
+