siefca-htsucker 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/lib/htsucker.rb +34 -0
  2. data/lib/htsucker/htsucker.rb +468 -0
  3. metadata +73 -0
@@ -0,0 +1,34 @@
1
+ # encoding: utf-8
2
+ #
3
+ # HTTP loading and transliteration
4
+ #
5
+ # Author:: Paweł Wilk (mailto:pw@gnu.org)
6
+ # Copyright:: Copyright (c) 2009 Paweł Wilk
7
+ # License:: LGPL
8
+
9
+ require 'iconv'
10
+ require 'htmlentities'
11
+ require 'net/http'
12
+ require 'net/https'
13
+ require 'timeout'
14
+ require 'uri'
15
+
16
+ require 'bufferaffects'
17
+ require './htsucker/domains_to_languages'
18
+ require './htsucker/htsucker'
19
+
20
+ # testing:
21
+
22
# Manual smoke test: fetches a handful of live sites and prints the final URL,
# detected language and detected charset of each one.
#
# It is guarded so that it runs only when this file is executed directly;
# simply requiring the library must not trigger network traffic.
if __FILE__ == $0
  sites = [
    'wykop.pl/wykopalisko',
    'poland.com',
    'hyperreal.info',
    'grono.net',
    'google.pl',
    'randomseed.pl',
    'heise-online.de'
  ]

  sites.each do |site|
    pa = HTSucker.new(site)
    puts "#{pa.real_url}: #{pa.language} #{pa.charset}"
  end
end
@@ -0,0 +1,468 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ class HTSucker
5
+
6
+ include DomainsToLanguages
7
+ include BufferAffects
8
+
9
+ buffers_reset_method :reset_buffers
10
+ attr_affects_buffers :url
11
+
12
+ attr_reader :url
13
+
14
+ # DefaultOpts is the template from which the class method HTSucker.default_options
15
+ # builds the class variable @@default_options; instances use that hash to fill in
16
+ # any options that were not given when the object was created.
17
+
18
+ DefaultOpts = { :redir_retry => 5,
19
+ :conn_retry => 8,
20
+ :total_retry => 2,
21
+ :read_timeout => 15,
22
+ :total_timeout => 30,
23
+ :allow_strange_ports => false,
24
+ :max_length => 524288 }.freeze
25
+
26
# Creates a new HTSucker for +url+, which may be a URI object or a string.
#
# +options+ may be a hash overriding the class-wide defaults returned by
# default_options. A key that is not present in those defaults raises
# ArgumentError. Every resulting option is stored in an instance variable
# of the same name.
def initialize(url, options=nil)
  settings = self.class.default_options.dup

  if options.respond_to?(:keys)
    unknown = (options.keys - settings.keys).join(', ')
    raise ArgumentError, "unknown options: #{unknown}" unless unknown.empty?
    settings.update(options)
  end

  settings.each_pair { |name, value| instance_variable_set("@#{name}", value) }

  reset_buffers
  @http_req = nil
  self.url = url
end
43
+
44
# Forgets everything learned from a previous fetch: the cached response,
# the resolved URL, the detected charset and content type, and the
# overflow counter.
def reset_buffers
  @overflow = 0
  @charset = @content_type = @response = @real_url = nil
end
53
+
54
# Sets a new URL. +url+ may be a URI object or a string; a string without a
# scheme (e.g. "example.com/page") is treated as an http:// address.
#
# The URL is checked with validate_url, stored frozen in @url, and a HEAD
# request for it is prepared in @http_req.
def url=(url)
  # Work on a copy so the caller's URI object is neither modified nor frozen.
  url = url.kind_of?(URI) ? url.dup : URI.parse(url)

  # Only a plain URI::Generic lacks a recognised scheme. Every URI class
  # inherits from URI::Generic, so is_a? would also match URI::HTTP and
  # prefix such URLs a second time.
  url = URI.parse("http://#{url}") if url.instance_of?(URI::Generic)

  url.path = '/' if url.path.nil? || url.path.empty?
  validate_url(url)
  @url = url
  @url.freeze
  # request_uri keeps the query string, which the bare path would drop.
  @http_req = Net::HTTP::Head.new(@url.request_uri)
end
65
+
66
# Returns the top-level domain of the requested URL as a lower-case symbol
# (the last dot-separated label of the host name).
def domain
  labels = url.host.split('.')
  labels.last.downcase.to_sym
end
71
+
72
# Returns the top-level domain of the real URL (the one the content was
# finally obtained from) as a lower-case symbol.
#
# Returns nil when there is no real URL or it has no host, instead of
# raising NoMethodError.
def real_domain
  resolved = real_url
  return nil if resolved.nil? || resolved.host.to_s.empty?
  resolved.host.split('.').last.downcase.to_sym
end
77
+
78
# Returns resource path of the requested URL.
def path; url.path end

# Returns resource path of the real URL, or nil when there is no real URL.
def real_path
  resolved = real_url
  resolved && resolved.path
end

# Returns hostname of the requested URL.
def host; url.host end

# Returns hostname of the real URL, or nil when there is no real URL.
def real_host
  resolved = real_url
  resolved && resolved.host
end

# Returns port of the requested URL.
def port; url.port end

# Returns port of the real URL, or nil when there is no real URL.
def real_port
  resolved = real_url
  resolved && resolved.port
end

# Returns protocol of the requested URL as a symbol derived from the URI
# class name (e.g. URI::HTTP gives :http).
def protocol; url.class.name.split('::').last.downcase.to_sym end

# Returns protocol of the real URL, or nil when there is no real URL.
# Without the guard a missing URL would be reported as :nilclass.
def real_protocol
  resolved = real_url
  resolved && resolved.class.name.split('::').last.downcase.to_sym
end
101
+
102
# Returns the page charset. The value is looked up with get_page_info on
# first use (which also fills in the content type) and remembered afterwards.
def charset
  if @charset.nil?
    info = get_page_info
    @content_type = info[0]
    @charset = info[1]
  end
  @charset
end
108
+
109
# Returns page charset (same as charset).
def content_charset; charset end

# Overrides the page charset.
#
# The value is written straight to @charset: a bare `charset = x` inside a
# method only creates a local variable, and the class defines no charset=
# writer to delegate to.
def content_charset=(x)
  @charset = x
end
111
+
112
# Returns the page content type. The value is looked up with get_page_info
# on first use (which also fills in the charset) and remembered afterwards.
def content_type
  if @content_type.nil?
    info = get_page_info
    @content_type = info[0]
    @charset = info[1]
  end
  @content_type
end
118
+
119
# Returns the part of the content type before the slash as a symbol
# (e.g. :text for text/html), or nil when it is missing or empty.
def content_type_major
  major = content_type.to_s.split('/').first.to_s
  major.empty? ? nil : major.to_sym
end
128
+
129
# Returns the part of the content type after the slash as a symbol
# (e.g. :html for text/html), or nil when it is missing or empty.
def content_type_minor
  minor = content_type.to_s.split('/')[1].to_s
  minor.empty? ? nil : minor.to_sym
end
138
+
139
# Checks that +url+ is acceptable and returns nil when it is.
#
# Raises HTSuckerBadURI for an empty URL and HTSuckerBadProtocol when the
# URI class is neither HTTP nor HTTPS. Unless @allow_strange_ports is set,
# raises HTSuckerBadPort when the port is not the standard one for the
# protocol (80 for HTTP, 443 for HTTPS).
def validate_url(url)
  raise HTSuckerBadURI.new("malformed URI") if url.to_s.empty?

  u_protocol = url.class.name.split('::').last.upcase
  standard_ports = { 'HTTP' => 80, 'HTTPS' => 443 }
  unless standard_ports.has_key?(u_protocol)
    raise HTSuckerBadProtocol.new("bad protocol: #{u_protocol}")
  end

  return nil if @allow_strange_ports
  if url.port != standard_ports[u_protocol]
    raise HTSuckerBadPort.new("strange port number: #{url.port}")
  end
  nil
end
private :validate_url
153
+
154
# Translates the top-level domain of the real URL into a spoken-language
# code using the @@domain_to_language table.
#
# A language is returned only when the first three characters of the page
# charset are one of iso, win, cp-, koi or utf, and the top-level domain is
# two characters long. Otherwise the result is nil.
def domain_to_spoken
  prefix = content_charset.to_s[0..2].downcase.to_sym
  return nil unless [:iso, :win, :"cp-", :koi, :utf].include?(prefix)
  return nil unless real_domain.length == 2
  @@domain_to_language[real_domain]
end
private :domain_to_spoken
166
+
167
# Returns the content language of the page, or +default_content_language+
# when none can be determined.
#
# Sources are tried in this order:
# 1. a <meta http-equiv="content-language"> tag,
# 2. the lang / xml:lang attribute of the <html> tag, then of the <body> tag,
# 3. the Content-Language server header; when that is missing or starts
#    with "en", the language suggested by domain_to_spoken is preferred,
# 4. the default.
#
# Steps 1 and 2 are used only when the body is non-empty and the major
# content type is :text.
def content_language(default_content_language='en')
  # Nothing was fetched, so there is nothing to inspect.
  return default_content_language if self.response.nil?

  clang = nil
  html = self.body.to_s

  if !html.empty? && self.content_type_major == :text
    # try meta-tag header
    header = html[/<meta http-equiv\s*=\s*['"]*content-language['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    clang = extract_content_language(header)

    # try lang and xml:lang attributes of the html tag, then of the body tag
    clang = extract_content_language(lang_attribute(html)) if clang.to_s.empty?
  end

  # try server header and in case of 'en' or empty try to figure language by looking at top-domain
  if clang.to_s.empty? && response.respond_to?(:header)
    clang = extract_content_language(response.header['content-language'])
    present = clang.to_s
    clang = domain_to_spoken if present.empty? || present[0..1] == 'en'
    clang = present if clang.to_s.empty? && !present.empty?
  end

  # try default
  clang = default_content_language if clang.to_s.empty?
  clang
end

# Returns the value of the first lang or xml:lang attribute found on the
# <html> (or <xhtml>) tag, then on the <body> tag, or nil when there is none.
# The attribute may be the first one in the tag or follow other attributes.
def lang_attribute(html)
  ['x?html', 'body'].each do |tag|
    ['lang', 'xml:lang'].each do |attr|
      value = html[/<#{tag}\s(?:[^>]*?\s)?#{attr}\s*=\s*["']*([^"'\s>]+)/i, 1]
      return value unless value.to_s.empty?
    end
  end
  nil
end
private :lang_attribute
217
+
218
# Shorthand for content_language with its default argument.
def language
  content_language
end

# Shorthand for content_language with its default argument.
def lang
  content_language
end
220
+
221
# Works out the content type and charset of the page and returns them as
# a two-element array [content_type, charset].
#
# When there is no response at all, the two defaults are returned. Otherwise
# a <meta http-equiv="content-type"> tag in the body is consulted first, but
# only if the body is non-empty and the server's Content-Type header has the
# major type "text". The server header fills in whatever the meta tag did
# not provide, and the defaults fill in whatever is still missing.
def get_page_info(default_content_type='text/html', default_charset='ascii')
  return [default_content_type, default_charset] if self.response.nil?

  # look at the server header first to see if the content can be analyzed at all
  has_headers = response.respond_to?(:header)
  server_header = has_headers ? response.header['content-type'] : nil
  server_major = has_headers ? extract_content_type(server_header).to_s.split('/').first : nil

  ctype = nil
  enc = nil

  # try meta-tag header
  html = self.body.to_s
  if !html.empty? && server_major == 'text'
    meta = html[/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i, 1]
    enc = extract_charset(meta)
    ctype = extract_content_type(meta)
  end

  # try server header
  if ctype.to_s.empty? && has_headers
    ctype = extract_content_type(server_header)
    # a page may declare a charset in its meta tag without declaring a type
    enc = extract_charset(server_header) if enc.to_s.empty?
  end

  # try defaults
  [ctype.to_s.empty? ? default_content_type : ctype,
   enc.to_s.empty? ? default_charset : enc]
end
private :get_page_info
257
+
258
# Extracts the charset from a content-type string such as
# "text/html; charset=UTF-8".
#
# Returns the canonical encoding name as a lower-case symbol (e.g. :"utf-8"),
# or nil when the string is empty, has no charset parameter, or names an
# encoding Ruby does not know. The value may be quoted and may be surrounded
# by spaces; "utf8" is accepted as a spelling of "utf-8".
def extract_charset(enc_string)
  return nil if enc_string.to_s.empty?

  params = {}
  enc_string.to_s.chomp.downcase.split(';').each do |segment|
    key, value = segment.split('=', 2)
    next if key.nil? || value.nil?
    params[key.strip] = value.delete(%q{"'}).strip
  end

  name = params['charset'].to_s
  return nil if name.empty?
  name = 'utf-8' if name == 'utf8'

  begin
    found = Encoding.find(name)
  rescue ArgumentError
    # unknown encoding name
    return nil
  end

  # Encoding.find can return nil (for the special name "internal").
  found ? found.name.downcase.to_sym : nil
end
private :extract_charset
284
+
285
# Extracts the content type (the part before the first semicolon) from a
# content-type string and returns it as a lower-case symbol, or nil when
# the string is empty or contains nothing before the semicolon.
def extract_content_type(ctype_string)
  return nil if ctype_string.to_s.empty?
  media_type, = ctype_string.chomp.squeeze(' ').split(';')
  media_type.nil? ? nil : media_type.strip.downcase.to_sym
end
private :extract_content_type
294
+
295
# Extracts the first language tag from a content-language string such as
# "pl-PL, en;q=0.8" and returns it as a lower-case symbol (:"pl-pl").
#
# Returns nil when the string is empty or contains no language tag, so that
# input like ";" or " " yields nil instead of an error or an empty symbol.
def extract_content_language(ltype_string)
  return nil if ltype_string.to_s.empty?
  first_entry = ltype_string.to_s.chomp.squeeze(' ').split(';').first.to_s
  tag = first_entry.split(',').first.to_s.strip.downcase
  tag.empty? ? nil : tag.to_sym
end
private :extract_content_language
304
+
305
+
306
# Resolves the URL with HEAD requests, following redirects, and returns the
# final response object. The result is cached in @response; @real_url,
# @content_length, @content_type and @charset are set along the way.
#
# Returns nil when the request fails, when the final status is not 200, or
# when the redirect/connection retry budgets (@redir_retry, @conn_retry) run
# out. Raises HTSuckerContentTooBig when the declared content length exceeds
# @max_length. validate_url may raise for an unacceptable redirect target.
def response
  return @response unless @response.nil?
  url = @url
  found = false
  response = nil
  @real_url = nil
  http_req = @http_req
  redir_retry = @redir_retry
  conn_retry = @conn_retry

  loop do
    return nil unless %w[http https].include?(url.scheme.to_s.downcase)
    begin
      # The :total_timeout option bounds each request; the previously used
      # @timeout was never assigned, so no timeout was enforced at all.
      response = Timeout::timeout(@total_timeout) { head_request(url, http_req) }
      response.value   # raises unless the status is 2xx
      # A 2xx answer will not change on retry: it is either the 200 we want
      # or some other success code, which counts as "not found".
      found = response.kind_of?(Net::HTTPOK)
      break
    rescue Net::HTTPRetriableError
      conn_retry -= 1
      location = response.respond_to?(:header) ? response.header['location'] : nil
      unless location.to_s.empty?
        # merge resolves a relative Location against the current URL
        url = url.merge(location)
        validate_url(url)
        # request_uri is never empty and keeps the query string
        http_req = Net::HTTP::Head.new(url.request_uri)
        redir_retry -= 1
      end
    rescue
      return nil
    end
    break if redir_retry < 0 || conn_retry < 0
  end

  return nil unless found

  # Check the size before caching, so an oversized document is rejected on
  # every call and not only on the first one.
  @content_length = response.header['content-length'].to_s.to_i
  if @content_length > @max_length
    raise HTSuckerContentTooBig.new("content length (#{@content_length}) is greater than declared limit (#{@max_length})")
  end

  @real_url = url
  @response = response

  # FIXME(review): the data read here is discarded, and this call works only
  # if open-uri has been loaded elsewhere. The cached response comes from a
  # HEAD request, so presumably this download was meant to supply the
  # document body -- confirm the intended design.
  openuri_opts = { :redirect=>false, :read_timeout=>false }
  resource = open(@real_url.to_s, openuri_opts)
  begin
    resource.read(@max_length)
  ensure
    resource.close if resource.respond_to?(:close)
  end

  @content_type, @charset = get_page_info(nil,nil) # using just server headers
  response
end

# Sends +http_req+ to the host and port named by +url+ and returns the raw
# response. TLS is used for https URLs.
def head_request(url, http_req)
  http = Net::HTTP.new(url.host, url.port)
  if url.scheme.to_s.downcase == 'https'
    http.use_ssl = true
    # NOTE(review): certificate verification is switched off here.
    http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  http.start { |conn| conn.request(http_req) }
end
private :head_request
368
+
369
# Returns the body of the fetched response, or nil when there is no
# response or it has no body accessor.
def body
  fetched = self.response
  return nil unless fetched.respond_to?(:body)
  fetched.body
end

# Alias for body; any arguments are passed straight through to it.
def fetch(*args)
  body(*args)
end
379
+
380
# Returns the URL the content was actually obtained from (e.g. after
# redirection), or nil when no response is available.
def real_url
  self.response.nil? ? nil : @real_url
end
386
+
387
# Strips HTML markup from +text+ (the document body when omitted) and
# decodes HTML entities. A missing body is treated as an empty document.
#
# Tabs become spaces and carriage returns are dropped; script and style
# elements and comments are replaced by a space; <br> and <p> become
# newlines; all remaining tags are removed.
def strip_html(text=nil)
  text ||= self.body
  @coder ||= HTMLEntities.new
  r = text.to_s.tr("\t", ' ')   # tr returns a copy, so the caller's string is untouched
  r.tr!("\r", '')
  r.sub!(%r{<body.*?>(.*?)</body>}mi, '\1')
  r.gsub!(%r{<script.*?>(.*?)</script>}mi, ' ')
  r.gsub!(%r{<style.*?>(.*?)</style>}mi, ' ')
  r.gsub!(%r{<!--.*?-->}mi, ' ')
  r.gsub!(/<br\s*\/?>|<p>/mi, "\n")
  r.gsub!(/<.*?>/m, '')
  # The decoder lives in @coder; a bare `coder` is an undefined name.
  @coder.decode(r)
end
402
+
403
# Converts +text+ (the document body when omitted) from encoding +enc+
# (the page charset when omitted) to lower-case ASCII text: markup is
# stripped with strip_html, characters are transliterated with Iconv, and
# anything outside a fixed set of characters is removed.
def clean_text(text=nil, enc=nil)
  text ||= self.body
  enc ||= self.charset
  @transliterator ||= Iconv.new('ASCII//TRANSLIT//IGNORE', 'UTF-8')

  # bring the input to UTF-8 and drop the markup
  txt = Iconv.iconv('UTF-8//IGNORE', enc, text).join
  txt = strip_html(txt)

  # swap quote characters for a placeholder during transliteration; the
  # placeholder is turned into an apostrophe further down
  txt.gsub!(/['`]/m, '_amp__')
  txt = @transliterator.conv(txt).downcase

  # sentence punctuation becomes spaces, then everything outside the
  # allowed character set is removed
  txt.tr!(".!?", ' ')
  txt.gsub!(/[^\x00-\x7F]+/, '')
  txt.gsub!(/[^a-z0-9\-_\[\]\(\)\*\=\@\#\$\%\^\&\{\}\:\;\,\<\>\+\s\n\.\!\?]+/im, '')
  txt.gsub!('_amp__',"'")

  # whitespace clean-up
  txt.squeeze!(" \n")
  txt.gsub!(/^\s?\n\s?$/m, '')
  txt.gsub!(/\n\s/,"\n")
  txt.gsub!(/\s\n/,"\n")
  txt.gsub!(/^\s+/,'')

  # drop apostrophes that wrap a phrase or stand alone
  txt.gsub!(/(^|\s)\'+(.*?)\'+(\s|$)/m,'\1\2\3')
  txt.gsub!(/(^|\s)\'+(\s|$)/, '')
  txt.squeeze!("\n ")
  txt
end

# Shorthand for clean_text applied to the document body.
def clean
  clean_text
end
429
+
430
# Runs clean_text on +text+ / +enc+ and reduces the result to words:
# bracketed links and bracketed runs of digits and punctuation are removed,
# every other run of non-alphanumeric characters becomes a single space.
def clean_words(text=nil, enc=nil)
  clean_text(text, enc).
    gsub(/\[\s*?[^\:]+?\:\/+?.*?\]/mi, ' ').
    gsub(/\[\s*?(\d|\s|[^\w])+\]/mi, ' ').
    gsub(/[^a-z0-9]+/im, ' ').
    squeeze(' ')
end
440
+
441
# Returns the words of the document body as an array, using the output of
# clean_words.
def words
  cleaned = clean_words
  cleaned.split(' ')
end
446
+
447
# Reads or changes the default options used when creating new objects.
#
# Called without an argument, returns the current defaults as a frozen hash.
# Called with a hash, merges it into the current defaults and returns the
# new frozen hash. The defaults start out as a copy of DefaultOpts and are
# kept in the class variable @@default_options.
#
# Raises ArgumentError when +opts+ is not hash-like or contains a key that
# is not present in DefaultOpts.
def self.default_options(opts=nil)
  @@default_options ||= DefaultOpts.dup.freeze
  return @@default_options if opts.nil?

  raise ArgumentError.new("malformed options") unless opts.respond_to?(:keys)

  unknown = (opts.keys - DefaultOpts.keys).join(', ')
  raise ArgumentError.new("unknown options: #{unknown}") unless unknown.empty?

  # A frozen hash cannot be unfrozen, so a new merged hash replaces the old one.
  @@default_options = @@default_options.merge(opts).freeze
end
466
+
467
+ end
468
+
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: siefca-htsucker
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - "Pawe\xC5\x82 Wilk"
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-04-28 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: htmlentities
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: bufferaffects
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ description: HTSucker is simple HTTP(S) reader with ability to transliterate body
36
+ email: pw@gnu.org
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/htsucker.rb
45
+ - lib/htsucker/htsucker.rb
46
+ has_rdoc: true
47
+ homepage: http://randomseed.pl/htsucker
48
+ post_install_message:
49
+ rdoc_options: []
50
+
51
+ require_paths:
52
+ - lib
53
+ required_ruby_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ version: "0"
58
+ version:
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ requirements: []
66
+
67
+ rubyforge_project:
68
+ rubygems_version: 1.2.0
69
+ signing_key:
70
+ specification_version: 2
71
+ summary: HTSucker is simple HTTP(S) reader with ability to transliterate body
72
+ test_files: []
73
+