crawlfish 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -43,7 +43,7 @@
43
43
  <file leaf-file-name="crawlfish.gemspec" pinned="false" current="false" current-in-tab="false">
44
44
  <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
45
45
  <provider selected="true" editor-type-id="text-editor">
46
- <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
46
+ <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
47
47
  <folding />
48
48
  </state>
49
49
  </provider>
@@ -70,7 +70,7 @@
70
70
  <file leaf-file-name="crawlfish.rb" pinned="false" current="false" current-in-tab="false">
71
71
  <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
72
72
  <provider selected="true" editor-type-id="text-editor">
73
- <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
73
+ <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
74
74
  <folding />
75
75
  </state>
76
76
  </provider>
@@ -322,23 +322,23 @@
322
322
  </state>
323
323
  </provider>
324
324
  </entry>
325
- <entry file="file://$PROJECT_DIR$/Gemfile">
325
+ <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
326
326
  <provider selected="true" editor-type-id="text-editor">
327
- <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
327
+ <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
328
328
  <folding />
329
329
  </state>
330
330
  </provider>
331
331
  </entry>
332
- <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
332
+ <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
333
333
  <provider selected="true" editor-type-id="text-editor">
334
- <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
334
+ <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
335
335
  <folding />
336
336
  </state>
337
337
  </provider>
338
338
  </entry>
339
- <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
339
+ <entry file="file://$PROJECT_DIR$/Gemfile">
340
340
  <provider selected="true" editor-type-id="text-editor">
341
- <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
341
+ <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
342
342
  <folding />
343
343
  </state>
344
344
  </provider>
@@ -2,6 +2,7 @@ require 'nokogiri'
2
2
  require 'open-uri'
3
3
 
4
4
  module Crawlfish
5
+ LOL = "giggle"
5
6
  class GoogleScraper
6
7
  attr_accessor :website, :keyword, :user_agent, :start, :i, :position
7
8
  def initialize(options)
@@ -59,4 +60,163 @@ module Crawlfish
59
60
  @current_page += 1
60
61
  end
61
62
  end
62
- end
63
+ end
64
+ =begin
65
+
66
+ module Lol
67
+ def scrape
68
+ require 'gscraper'
69
+ set_keyword
70
+ set_domain
71
+ set_user_agent
72
+ reset_pager
73
+ query_keyword
74
+ # Until position is found OR ten pages are searched, loop through ten pages
75
+ until @position or @pager_position > 10
76
+ search_this_page
77
+ sleep 2.seconds
78
+ next_page
79
+ end
80
+ @position ||= -1 # -1 if not found
81
+ log_the_end_of_the_scrape
82
+ return {:position => @position, :measured_at => Time.now, :engine => "Google"}
83
+ end
84
+
85
+ def create_logger
86
+ @log = Logger.new("#{Rails.root}/log/scraping.log")
87
+ end
88
+ def log(log_message)
89
+ @log.debug log_message
90
+ end
91
+ def query_keyword
92
+ @query = GScraper::Search.query(:query => @keyword)
93
+ end
94
+ def set_keyword
95
+ @keyword = self.text
96
+ end
97
+ def set_domain
98
+ @domain = self.website.url
99
+ end
100
+ def set_user_agent
101
+ @user_agent = get_random_user_agent
102
+ end
103
+ def reset_pager
104
+ @pager_position = 1
105
+ end
106
+ def search_this_page
107
+ log "page: #{@pager_position}"
108
+ begin
109
+ @query.page(@pager_position).each do |result|
110
+ host = URI::parse(URI::extract(result.url.to_s).first).host
111
+ log "#{result.rank}. Checking host: '#{host}' against domain: '#{@domain}'"
112
+ if host == @domain
113
+ @position = result.rank
114
+ end
115
+ end
116
+ rescue
117
+ log "rescued."
118
+ end
119
+ end
120
+ def next_page
121
+ @pager_position += 1
122
+ end
123
+ end
124
+
125
+ # =========================================================================================================================
126
+ module lol2
127
+ def scrape
128
+
129
+ 82
130
+ - until position or start==100 # until position gets set or reaches 10th page
131
+ 83
132
+ - page = (start.to_i+10)/10
133
+ 84
134
+ - log.debug "page: #{page}... "
135
+ 85
136
+ - url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
137
+ 86
138
+ - log.debug "url: #{url}"
139
+ 87
140
+ - #url = Rails.root + "test/google-search/search#{page}.html"
141
+ 88
142
+ - doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
143
+ 89
144
+ - links = doc.xpath('//h3/a[contains(@class, "l")]')
145
+ 90
146
+ - #links = doc.search('//h3/a[@class="l"]')
147
+ 91
148
+ -
149
+ 92
150
+ - #if links empty, try a more general scrape
151
+ 93
152
+ - if links.empty?
153
+ 94
154
+ - links = doc.xpath('//h3[@class = "r"]/a]')
155
+ 95
156
+ - end
157
+ 96 66
158
+
159
+ 97
160
+ - if links.empty?
161
+ 98
162
+ - if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
163
+ 99
164
+ - # If no results returned,
165
+ 100
166
+ - log.debug "Page contains no results"
167
+ 101
168
+ - break
169
+ 102
170
+ - else
171
+ 103
172
+ - # No links, but also no "found no results" page. throw error
173
+ 104
174
+ - log.debug doc.to_s
175
+ 105
176
+ - log.debug "Raising: PageContainsNoLinks"
177
+ 106
178
+ - raise "PageContainsNoLinks"
179
+ 107
180
+ - end
181
+ 108
182
+ - end
183
+ 109 67
184
+
185
+ 110
186
+ - links.each do |link|
187
+ 111
188
+ -
189
+ 112
190
+ - # Remove protocol prefix
191
+ 113
192
+ - to_remove = ["http://", "https://"]
193
+ 114
194
+ - reg = Regexp.new(to_remove.map{ |s| "(#{s})" }.join('|'))
195
+ 115
196
+ - link = link['href'].gsub(reg, '')
197
+ 116 68
198
+
199
+ 117
200
+ - # If link start with '/url?q=', remove it
201
+ 118
202
+ - if link[0..6] == '/url?q='
203
+ 119
204
+ - link = link[7..-1]
205
+ 120
206
+ - end
207
+ 121
208
+ -
209
+ 122
210
+ - # If link starts with 'www.', remove it
211
+ 123
212
+ - if link[0..3] == 'www.'
213
+ 124
214
+ - link = link[4..-1]
215
+ 125
216
+ - end
217
+ 126
218
+ -
219
+ 127
220
+ - log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
221
+ 69
222
+ + def scrape=end
@@ -1,3 +1,3 @@
1
1
  module Crawlfish
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlfish
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dan Neumann