upton 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/upton.rb +55 -9
- data/test/data/easttimor.html +833 -0
- data/test/test_upton.rb +67 -8
- metadata +111 -67
- checksums.yaml +0 -7
data/lib/upton.rb
CHANGED
@@ -12,13 +12,16 @@
|
|
12
12
|
# 2. Instance pages, which represent the goal of your scraping, e.g.
|
13
13
|
# job listings or news articles.
|
14
14
|
|
15
|
+
require 'nokogiri'
|
16
|
+
require './lib/utils'
|
17
|
+
|
15
18
|
module Upton
|
16
19
|
|
17
20
|
# Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
|
18
21
|
# in more complicated cases; e.g. +MyScraper < Upton::Scraper+
|
19
22
|
class Scraper
|
20
23
|
|
21
|
-
attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder
|
24
|
+
attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
|
22
25
|
|
23
26
|
# == Basic use-case methods.
|
24
27
|
|
@@ -27,7 +30,10 @@ module Upton
|
|
27
30
|
# the text of each instance page, (and optionally, its URL and its index
|
28
31
|
# in the list of instance URLs returned by +get_index+).
|
29
32
|
def scrape &blk
|
30
|
-
self.
|
33
|
+
unless self.url_array
|
34
|
+
self.url_array = self.get_index
|
35
|
+
end
|
36
|
+
self.scrape_from_list(self.url_array, blk)
|
31
37
|
end
|
32
38
|
|
33
39
|
|
@@ -40,11 +46,22 @@ module Upton
|
|
40
46
|
#
|
41
47
|
# These options are a shortcut. If you plan to override +get_index+, you
|
42
48
|
# do not need to set them.
|
43
|
-
def initialize(
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
49
|
+
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
50
|
+
|
51
|
+
#if first arg is a valid URL, do already-written stuff;
|
52
|
+
#if it's not (or if it's a list?) don't bother with get_index, etc.
|
53
|
+
#e.g. Scraper.new(["http://jeremybmerrill.com"])
|
54
|
+
|
55
|
+
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
56
|
+
if selector.empty?
|
57
|
+
@url_array = index_url_or_array
|
58
|
+
elsif index_url_or_array =~ URI::ABS_URI
|
59
|
+
@index_url = index_url_or_array
|
60
|
+
@index_selector = selector
|
61
|
+
@index_selector_method = selector_method
|
62
|
+
else
|
63
|
+
raise ArgumentError
|
64
|
+
end
|
48
65
|
# If true, then Upton prints information about when it gets
|
49
66
|
# files from the internet and when it gets them from its stash.
|
50
67
|
@verbose = false
|
@@ -98,6 +115,15 @@ module Upton
|
|
98
115
|
""
|
99
116
|
end
|
100
117
|
|
118
|
+
def scrape_to_csv filename, &blk
|
119
|
+
require 'csv'
|
120
|
+
unless self.url_array
|
121
|
+
self.url_array = self.get_index
|
122
|
+
end
|
123
|
+
CSV.open filename, 'wb' do |csv|
|
124
|
+
self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
|
125
|
+
end
|
126
|
+
end
|
101
127
|
|
102
128
|
protected
|
103
129
|
|
@@ -109,20 +135,40 @@ module Upton
|
|
109
135
|
#the filename for each stashed version is a cleaned version of the URL.
|
110
136
|
if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
|
111
137
|
puts "usin' a stashed copy of " + url if @verbose
|
112
|
-
resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
|
138
|
+
resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
|
113
139
|
else
|
114
140
|
begin
|
115
141
|
puts "getting " + url if @verbose
|
116
142
|
sleep @nice_sleep_time
|
117
143
|
resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
144
|
+
|
145
|
+
#this is silly, but rest-client needs to get on their game.
|
146
|
+
#cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
|
147
|
+
if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
|
148
|
+
charset = if set = resp.net_http_res.type_params['charset']
|
149
|
+
set
|
150
|
+
elsif content_type == 'text/xml'
|
151
|
+
'us-ascii'
|
152
|
+
elsif content_type.split('/').first == 'text'
|
153
|
+
'iso-8859-1'
|
154
|
+
end
|
155
|
+
puts charset
|
156
|
+
resp.force_encoding(charset) if charset
|
157
|
+
end
|
158
|
+
|
118
159
|
rescue RestClient::ResourceNotFound
|
160
|
+
puts "404 error, skipping: #{url}" if @verbose
|
119
161
|
resp = ""
|
120
162
|
rescue RestClient::InternalServerError
|
163
|
+
puts "500 Error, skipping: #{url}" if @verbose
|
164
|
+
resp = ""
|
165
|
+
rescue URI::InvalidURIError
|
166
|
+
puts "Invalid URI: #{url}" if @verbose
|
121
167
|
resp = ""
|
122
168
|
end
|
123
169
|
if stash
|
124
170
|
puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
|
125
|
-
open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
|
171
|
+
open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
|
126
172
|
end
|
127
173
|
end
|
128
174
|
resp
|