upton 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/upton.rb CHANGED
@@ -12,13 +12,16 @@
12
12
  # 2. Instance pages, which represent the goal of your scraping, e.g.
13
13
  # job listings or news articles.
14
14
 
15
+ require 'nokogiri'
16
+ require './lib/utils'
17
+
15
18
  module Upton
16
19
 
17
20
  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
18
21
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
19
22
  class Scraper
20
23
 
21
- attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder
24
+ attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
22
25
 
23
26
  # == Basic use-case methods.
24
27
 
@@ -27,7 +30,10 @@ module Upton
27
30
  # the text of each instance page, (and optionally, its URL and its index
28
31
  # in the list of instance URLs returned by +get_index+).
29
32
  def scrape &blk
30
- self.scrape_from_list(self.get_index, blk)
33
+ unless self.url_array
34
+ self.url_array = self.get_index
35
+ end
36
+ self.scrape_from_list(self.url_array, blk)
31
37
  end
32
38
 
33
39
 
@@ -40,11 +46,22 @@ module Upton
40
46
  #
41
47
  # These options are a shortcut. If you plan to override +get_index+, you
42
48
  # do not need to set them.
43
- def initialize(index_url="", selector="", selector_method=:xpath)
44
- @index_url = index_url
45
- @index_selector = selector
46
- @index_selector_method = selector_method
47
-
49
+ def initialize(index_url_or_array, selector="", selector_method=:xpath)
50
+
51
+ #if first arg is a valid URL, do already-written stuff;
52
+ #if it's not (or if it's a list?) don't bother with get_index, etc.
53
+ #e.g. Scraper.new(["http://jeremybmerrill.com"])
54
+
55
+ #TODO: rewrite this, because it's a little silly (i.e. there should be a more sensible division of how these arguments work).
56
+ if selector.empty?
57
+ @url_array = index_url_or_array
58
+ elsif index_url_or_array =~ URI::ABS_URI
59
+ @index_url = index_url_or_array
60
+ @index_selector = selector
61
+ @index_selector_method = selector_method
62
+ else
63
+ raise ArgumentError
64
+ end
48
65
  # If true, then Upton prints information about when it gets
49
66
  # files from the internet and when it gets them from its stash.
50
67
  @verbose = false
@@ -98,6 +115,15 @@ module Upton
98
115
  ""
99
116
  end
100
117
 
118
+ def scrape_to_csv filename, &blk
119
+ require 'csv'
120
+ unless self.url_array
121
+ self.url_array = self.get_index
122
+ end
123
+ CSV.open filename, 'wb' do |csv|
124
+ self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
125
+ end
126
+ end
101
127
 
102
128
  protected
103
129
 
@@ -109,20 +135,40 @@ module Upton
109
135
  #the filename for each stashed version is a cleaned version of the URL.
110
136
  if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
111
137
  puts "usin' a stashed copy of " + url if @verbose
112
- resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
138
+ resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
113
139
  else
114
140
  begin
115
141
  puts "getting " + url if @verbose
116
142
  sleep @nice_sleep_time
117
143
  resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
144
+
145
+ #this is silly, but rest-client needs to get on their game.
146
+ #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
147
+ if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
148
+ charset = if set = resp.net_http_res.type_params['charset']
149
+ set
150
+ elsif content_type == 'text/xml'
151
+ 'us-ascii'
152
+ elsif content_type.split('/').first == 'text'
153
+ 'iso-8859-1'
154
+ end
155
+ puts charset
156
+ resp.force_encoding(charset) if charset
157
+ end
158
+
118
159
  rescue RestClient::ResourceNotFound
160
+ puts "404 error, skipping: #{url}" if @verbose
119
161
  resp = ""
120
162
  rescue RestClient::InternalServerError
163
+ puts "500 Error, skipping: #{url}" if @verbose
164
+ resp = ""
165
+ rescue URI::InvalidURIError
166
+ puts "Invalid URI: #{url}" if @verbose
121
167
  resp = ""
122
168
  end
123
169
  if stash
124
170
  puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
125
- open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
171
+ open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
126
172
  end
127
173
  end
128
174
  resp