upton 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/upton.rb +55 -9
- data/test/data/easttimor.html +833 -0
- data/test/test_upton.rb +67 -8
- metadata +111 -67
- checksums.yaml +0 -7
data/lib/upton.rb
CHANGED
@@ -12,13 +12,16 @@
|
|
12
12
|
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
13
|
# job listings or news articles.
|
14
14
|
|
15
|
+
require 'nokogiri'
|
16
|
+
require './lib/utils'
|
17
|
+
|
15
18
|
module Upton
|
16
19
|
|
17
20
|
# Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
|
18
21
|
# in more complicated cases; e.g. +MyScraper < Upton::Scraper+
|
19
22
|
class Scraper
|
20
23
|
|
21
|
-
attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder
|
24
|
+
attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
|
22
25
|
|
23
26
|
# == Basic use-case methods.
|
24
27
|
|
@@ -27,7 +30,10 @@ module Upton
|
|
27
30
|
# the text of each instance page, (and optionally, its URL and its index
|
28
31
|
# in the list of instance URLs returned by +get_index+).
|
29
32
|
def scrape &blk
|
30
|
-
self.
|
33
|
+
unless self.url_array
|
34
|
+
self.url_array = self.get_index
|
35
|
+
end
|
36
|
+
self.scrape_from_list(self.url_array, blk)
|
31
37
|
end
|
32
38
|
|
33
39
|
|
@@ -40,11 +46,22 @@ module Upton
|
|
40
46
|
#
|
41
47
|
# These options are a shortcut. If you plan to override +get_index+, you
|
42
48
|
# do not need to set them.
|
43
|
-
def initialize(
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
49
|
+
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
50
|
+
|
51
|
+
#if first arg is a valid URL, do already-written stuff;
|
52
|
+
#if it's not (or if it's a list?) don't bother with get_index, etc.
|
53
|
+
#e.g. Scraper.new(["http://jeremybmerrill.com"])
|
54
|
+
|
55
|
+
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
56
|
+
if selector.empty?
|
57
|
+
@url_array = index_url_or_array
|
58
|
+
elsif index_url_or_array =~ URI::ABS_URI
|
59
|
+
@index_url = index_url_or_array
|
60
|
+
@index_selector = selector
|
61
|
+
@index_selector_method = selector_method
|
62
|
+
else
|
63
|
+
raise ArgumentError
|
64
|
+
end
|
48
65
|
# If true, then Upton prints information about when it gets
|
49
66
|
# files from the internet and when it gets them from its stash.
|
50
67
|
@verbose = false
|
@@ -98,6 +115,15 @@ module Upton
|
|
98
115
|
""
|
99
116
|
end
|
100
117
|
|
118
|
+
def scrape_to_csv filename, &blk
|
119
|
+
require 'csv'
|
120
|
+
unless self.url_array
|
121
|
+
self.url_array = self.get_index
|
122
|
+
end
|
123
|
+
CSV.open filename, 'wb' do |csv|
|
124
|
+
self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
|
125
|
+
end
|
126
|
+
end
|
101
127
|
|
102
128
|
protected
|
103
129
|
|
@@ -109,20 +135,40 @@ module Upton
|
|
109
135
|
#the filename for each stashed version is a cleaned version of the URL.
|
110
136
|
if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
|
111
137
|
puts "usin' a stashed copy of " + url if @verbose
|
112
|
-
resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
|
138
|
+
resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
|
113
139
|
else
|
114
140
|
begin
|
115
141
|
puts "getting " + url if @verbose
|
116
142
|
sleep @nice_sleep_time
|
117
143
|
resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
144
|
+
|
145
|
+
#this is silly, but rest-client needs to get on their game.
|
146
|
+
#cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
|
147
|
+
if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
|
148
|
+
charset = if set = resp.net_http_res.type_params['charset']
|
149
|
+
set
|
150
|
+
elsif content_type == 'text/xml'
|
151
|
+
'us-ascii'
|
152
|
+
elsif content_type.split('/').first == 'text'
|
153
|
+
'iso-8859-1'
|
154
|
+
end
|
155
|
+
puts charset
|
156
|
+
resp.force_encoding(charset) if charset
|
157
|
+
end
|
158
|
+
|
118
159
|
rescue RestClient::ResourceNotFound
|
160
|
+
puts "404 error, skipping: #{url}" if @verbose
|
119
161
|
resp = ""
|
120
162
|
rescue RestClient::InternalServerError
|
163
|
+
puts "500 Error, skipping: #{url}" if @verbose
|
164
|
+
resp = ""
|
165
|
+
rescue URI::InvalidURIError
|
166
|
+
puts "Invalid URI: #{url}" if @verbose
|
121
167
|
resp = ""
|
122
168
|
end
|
123
169
|
if stash
|
124
170
|
puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
|
125
|
-
open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
|
171
|
+
open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
|
126
172
|
end
|
127
173
|
end
|
128
174
|
resp
|