siefca-httpage 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/httpage/httpage.rb +49 -22
- metadata +1 -1
data/lib/httpage/httpage.rb
CHANGED
@@ -9,16 +9,19 @@ class HTTPage
|
|
9
9
|
attr_affects_buffers :url, :encoding
|
10
10
|
|
11
11
|
attr_accessor :redir_retry, :conn_retry, :timeout, :url
|
12
|
-
|
12
|
+
attr_reader :real_url
|
13
|
+
attr_writer :encoding
|
13
14
|
|
14
15
|
# Builds a new page object for +url+.
#
# url::         address of the document; assigned through the +url=+ writer
# redir_retry:: stored in @redir_retry (default 5)
# conn_retry::  stored in @conn_retry (default 8)
# timeout::     stored in @timeout (default 40)
def initialize(url,redir_retry=5,conn_retry=8,timeout=40)
  # Nothing has been fetched or detected yet.
  @encoding = @content_type = @response = @http_req = nil

  @redir_retry, @conn_retry, @timeout = redir_retry, conn_retry, timeout

  @real_url = nil

  # Goes through the writer rather than touching @url directly; kept as the
  # last step so every other instance variable is already initialised.
  self.url = url
end
|
23
26
|
|
24
27
|
# Resets encoding and response buffers.
|
@@ -39,31 +42,42 @@ class HTTPage
|
|
39
42
|
# Returns page encoding.
|
40
43
|
|
41
44
|
# Reader for the page encoding. When no encoding is stored yet, both
# @encoding and @content_type are filled from get_page_info first.
def encoding
  if @encoding.nil?
    @encoding, @content_type = get_page_info
  end
  @encoding
end
|
48
|
+
|
49
|
+
# Returns page content-type.
|
50
|
+
|
51
|
+
# Reader for the page content-type. When no content-type is stored yet, both
# @encoding and @content_type are filled from get_page_info first.
def content_type
  if @content_type.nil?
    @encoding, @content_type = get_page_info
  end
  @content_type
end
|
44
55
|
|
45
56
|
# Obtains encoding and content-type from document body or server response header.
|
46
57
|
|
47
|
-
def
|
48
|
-
return default_encoding if self.response.nil?
|
58
|
+
# Returns a two-element array: [encoding, content_type].
#
# The order matches how callers destructure the result
# (<tt>@encoding, @content_type = get_page_info</tt>).
#
# Lookup order, applied to each value independently:
# 1. the <meta http-equiv="content-type"> tag found in the document body,
# 2. the Content-Type header of the server response,
# 3. the supplied default (+default_encoding+ / +default_content_type+).
#
# When there is no response at all, both defaults are returned.
def get_page_info(default_encoding='ascii', default_content_type='text/html')
  page = response
  return [default_encoding, default_content_type] if page.nil?

  # try meta-tag header (a missing body is treated as an empty document)
  header = page.body.to_s.scan(/<meta http-equiv\s*=\s*['"]*content-type['"]*\s*content\s*=\s*['"]*\s*(.*?)\s*['"]*\s*\/?>/i)
  header = header.flatten.first
  enc   = extract_encoding(header)
  ctype = extract_content_type(header)

  # try server header for whichever value the meta tag did not provide;
  # a meta tag without a charset must not hide the charset sent by the server
  if enc.nil? || ctype.nil?
    header = page.header['content-type']
    enc    = extract_encoding(header)     if enc.nil?
    ctype  = extract_content_type(header) if ctype.nil?
  end

  # try default
  enc   = default_encoding     if enc.nil?
  ctype = default_content_type if ctype.nil?

  return [enc, ctype]
end
|
66
|
-
private :
|
80
|
+
private :get_page_info
|
67
81
|
|
68
82
|
# Extracts encoding from content-type string.
|
69
83
|
|
@@ -92,16 +106,28 @@ class HTTPage
|
|
92
106
|
end
|
93
107
|
private :extract_encoding
|
94
108
|
|
109
|
+
# Extracts content-type from content-type string.
|
110
|
+
|
111
|
+
# Pulls the media type out of a Content-Type style string.
#
# ctype_string:: e.g. "Text/HTML; charset=UTF-8"; may be nil
#
# Returns the part before the first ';', stripped and lower-cased, as a
# Symbol (e.g. :"text/html"). Returns nil when the input is nil or when the
# media type part is blank, so callers can fall back to another source
# instead of receiving an empty symbol.
def extract_content_type(ctype_string)
  return nil if ctype_string.nil?

  media_type = ctype_string.chomp.squeeze(' ').split(';').first
  return nil if media_type.nil? # input was empty or consisted only of ';'

  media_type = media_type.strip.downcase
  return nil if media_type.empty?

  return media_type.to_sym
end
|
117
|
+
private :extract_content_type
|
118
|
+
|
119
|
+
|
95
120
|
# Fetches document using HTTP and returns response object. It also sets encoding, content-type and real URL.
|
96
121
|
|
97
122
|
def response
|
98
123
|
return @response unless @response.nil?
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
124
|
+
url = @url
|
125
|
+
found = false
|
126
|
+
response = nil
|
127
|
+
@real_url = nil
|
128
|
+
http_req = @http_req
|
103
129
|
redir_retry = @redir_retry
|
104
|
-
conn_retry
|
130
|
+
conn_retry = @conn_retry
|
105
131
|
|
106
132
|
until found do
|
107
133
|
begin
|
@@ -136,8 +162,9 @@ class HTTPage
|
|
136
162
|
break if (redir_retry < 0 || conn_retry < 0)
|
137
163
|
end
|
138
164
|
if found
|
165
|
+
@real_url = url
|
139
166
|
@response = response
|
140
|
-
@encoding =
|
167
|
+
@encoding, @content_type = get_page_info
|
141
168
|
return response
|
142
169
|
else
|
143
170
|
return nil
|