upton 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/upton.rb CHANGED
@@ -12,13 +12,16 @@
12
12
  # 2. Instance pages, which represent the goal of your scraping, e.g.
13
13
  # job listings or news articles.
14
14
 
15
+ require 'nokogiri'
16
+ require './lib/utils'
17
+
15
18
  module Upton
16
19
 
17
20
  # Upton::Scraper can be used as-is for basic use-cases, or can be subclassed
18
21
  # in more complicated cases; e.g. +MyScraper < Upton::Scraper+
19
22
  class Scraper
20
23
 
21
- attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder
24
+ attr_accessor :verbose, :debug, :nice_sleep_time, :stash_folder, :url_array
22
25
 
23
26
  # == Basic use-case methods.
24
27
 
@@ -27,7 +30,10 @@ module Upton
27
30
  # the text of each instance page, (and optionally, its URL and its index
28
31
  # in the list of instance URLs returned by +get_index+).
29
32
  def scrape &blk
30
- self.scrape_from_list(self.get_index, blk)
33
+ unless self.url_array
34
+ self.url_array = self.get_index
35
+ end
36
+ self.scrape_from_list(self.url_array, blk)
31
37
  end
32
38
 
33
39
 
@@ -40,11 +46,22 @@ module Upton
40
46
  #
41
47
  # These options are a shortcut. If you plan to override +get_index+, you
42
48
  # do not need to set them.
43
- def initialize(index_url="", selector="", selector_method=:xpath)
44
- @index_url = index_url
45
- @index_selector = selector
46
- @index_selector_method = selector_method
47
-
49
+ def initialize(index_url_or_array, selector="", selector_method=:xpath)
50
+
51
+ #if first arg is a valid URL, do already-written stuff;
52
+ #if it's not (or if it's a list?) don't bother with get_index, etc.
53
+ #e.g. Scraper.new(["http://jeremybmerrill.com"])
54
+
55
+ #TODO: rewrite this, because it's a little silly (i.e. there should be a more sensible division of how these arguments work)
56
+ if selector.empty?
57
+ @url_array = index_url_or_array
58
+ elsif index_url_or_array =~ URI::ABS_URI
59
+ @index_url = index_url_or_array
60
+ @index_selector = selector
61
+ @index_selector_method = selector_method
62
+ else
63
+ raise ArgumentError
64
+ end
48
65
  # If true, then Upton prints information about when it gets
49
66
  # files from the internet and when it gets them from its stash.
50
67
  @verbose = false
@@ -98,6 +115,15 @@ module Upton
98
115
  ""
99
116
  end
100
117
 
118
+ def scrape_to_csv filename, &blk
119
+ require 'csv'
120
+ unless self.url_array
121
+ self.url_array = self.get_index
122
+ end
123
+ CSV.open filename, 'wb' do |csv|
124
+ self.scrape_from_list(self.url_array, blk).each{|document| document.each{|line| csv << line }}
125
+ end
126
+ end
101
127
 
102
128
  protected
103
129
 
@@ -109,20 +135,40 @@ module Upton
109
135
  #the filename for each stashed version is a cleaned version of the URL.
110
136
  if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
111
137
  puts "usin' a stashed copy of " + url if @verbose
112
- resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read
138
+ resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
113
139
  else
114
140
  begin
115
141
  puts "getting " + url if @verbose
116
142
  sleep @nice_sleep_time
117
143
  resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
144
+
145
+ #this is silly, but rest-client needs to get on their game.
146
+ #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
147
+ if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
148
+ charset = if set = resp.net_http_res.type_params['charset']
149
+ set
150
+ elsif content_type == 'text/xml'
151
+ 'us-ascii'
152
+ elsif content_type.split('/').first == 'text'
153
+ 'iso-8859-1'
154
+ end
155
+ puts charset
156
+ resp.force_encoding(charset) if charset
157
+ end
158
+
118
159
  rescue RestClient::ResourceNotFound
160
+ puts "404 error, skipping: #{url}" if @verbose
119
161
  resp = ""
120
162
  rescue RestClient::InternalServerError
163
+ puts "500 Error, skipping: #{url}" if @verbose
164
+ resp = ""
165
+ rescue URI::InvalidURIError
166
+ puts "Invalid URI: #{url}" if @verbose
121
167
  resp = ""
122
168
  end
123
169
  if stash
124
170
  puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
125
- open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ))}
171
+ open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
126
172
  end
127
173
  end
128
174
  resp