crawlfish 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -43,7 +43,7 @@
43
43
  <file leaf-file-name="crawlfish.gemspec" pinned="false" current="false" current-in-tab="false">
44
44
  <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
45
45
  <provider selected="true" editor-type-id="text-editor">
46
- <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
46
+ <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
47
47
  <folding />
48
48
  </state>
49
49
  </provider>
@@ -70,7 +70,7 @@
70
70
  <file leaf-file-name="crawlfish.rb" pinned="false" current="false" current-in-tab="false">
71
71
  <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
72
72
  <provider selected="true" editor-type-id="text-editor">
73
- <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
73
+ <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
74
74
  <folding />
75
75
  </state>
76
76
  </provider>
@@ -322,23 +322,23 @@
322
322
  </state>
323
323
  </provider>
324
324
  </entry>
325
- <entry file="file://$PROJECT_DIR$/Gemfile">
325
+ <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
326
326
  <provider selected="true" editor-type-id="text-editor">
327
- <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
327
+ <state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
328
328
  <folding />
329
329
  </state>
330
330
  </provider>
331
331
  </entry>
332
- <entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
332
+ <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
333
333
  <provider selected="true" editor-type-id="text-editor">
334
- <state line="10" column="21" selection-start="335" selection-end="335" vertical-scroll-proportion="0.0">
334
+ <state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
335
335
  <folding />
336
336
  </state>
337
337
  </provider>
338
338
  </entry>
339
- <entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
339
+ <entry file="file://$PROJECT_DIR$/Gemfile">
340
340
  <provider selected="true" editor-type-id="text-editor">
341
- <state line="61" column="3" selection-start="1587" selection-end="1587" vertical-scroll-proportion="0.0">
341
+ <state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
342
342
  <folding />
343
343
  </state>
344
344
  </provider>
@@ -2,6 +2,7 @@ require 'nokogiri'
2
2
  require 'open-uri'
3
3
 
4
4
  module Crawlfish
5
+ LOL = "giggle"
5
6
  class GoogleScraper
6
7
  attr_accessor :website, :keyword, :user_agent, :start, :i, :position
7
8
  def initialize(options)
@@ -59,4 +60,163 @@ module Crawlfish
59
60
  @current_page += 1
60
61
  end
61
62
  end
62
- end
63
+ end
64
+ =begin
65
+
66
+ module Lol
67
+ def scrape
68
+ require 'gscraper'
69
+ set_keyword
70
+ set_domain
71
+ set_user_agent
72
+ reset_pager
73
+ query_keyword
74
+ # Until position is found OR ten pages are searched, loop through ten pages
75
+ until @position or @pager_position > 10
76
+ search_this_page
77
+ sleep 2.seconds
78
+ next_page
79
+ end
80
+ @position ||= -1 # -1 if not found
81
+ log_the_end_of_the_scrape
82
+ return {:position => @position, :measured_at => Time.now, :engine => "Google"}
83
+ end
84
+
85
+ def create_logger
86
+ @log = Logger.new("#{Rails.root}/log/scraping.log")
87
+ end
88
+ def log(log_message)
89
+ @log.debug log_message
90
+ end
91
+ def query_keyword
92
+ @query = GScraper::Search.query(:query => @keyword)
93
+ end
94
+ def set_keyword
95
+ @keyword = self.text
96
+ end
97
+ def set_domain
98
+ @domain = self.website.url
99
+ end
100
+ def set_user_agent
101
+ @user_agent = get_random_user_agent
102
+ end
103
+ def reset_pager
104
+ @pager_position = 1
105
+ end
106
+ def search_this_page
107
+ log "page: #{@pager_position}"
108
+ begin
109
+ @query.page(@pager_position).each do |result|
110
+ host = URI::parse(URI::extract(result.url.to_s).first).host
111
+ log "#{result.rank}. Checking host: '#{host}' against domain: '#{@domain}'"
112
+ if host == @domain
113
+ @position = result.rank
114
+ end
115
+ end
116
+ rescue
117
+ log "rescued."
118
+ end
119
+ end
120
+ def next_page
121
+ @pager_position += 1
122
+ end
123
+ end
124
+
125
+ # =========================================================================================================================
126
+ module lol2
127
+ def scrape
128
+
129
+ 82
130
+ - until position or start==100 # until position gets set or reaches 10th page
131
+ 83
132
+ - page = (start.to_i+10)/10
133
+ 84
134
+ - log.debug "page: #{page}... "
135
+ 85
136
+ - url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
137
+ 86
138
+ - log.debug "url: #{url}"
139
+ 87
140
+ - #url = Rails.root + "test/google-search/search#{page}.html"
141
+ 88
142
+ - doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
143
+ 89
144
+ - links = doc.xpath('//h3/a[contains(@class, "l")]')
145
+ 90
146
+ - #links = doc.search('//h3/a[@class="l"]')
147
+ 91
148
+ -
149
+ 92
150
+ - #if links empty, try a more general scrape
151
+ 93
152
+ - if links.empty?
153
+ 94
154
+ - links = doc.xpath('//h3[@class = "r"]/a]')
155
+ 95
156
+ - end
157
+ 96 66
158
+
159
+ 97
160
+ - if links.empty?
161
+ 98
162
+ - if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
163
+ 99
164
+ - # If no results returned,
165
+ 100
166
+ - log.debug "Page contains no results"
167
+ 101
168
+ - break
169
+ 102
170
+ - else
171
+ 103
172
+ - # No links, but also no "found no results" page. throw error
173
+ 104
174
+ - log.debug doc.to_s
175
+ 105
176
+ - log.debug "Raising: PageContainsNoLinks"
177
+ 106
178
+ - raise "PageContainsNoLinks"
179
+ 107
180
+ - end
181
+ 108
182
+ - end
183
+ 109 67
184
+
185
+ 110
186
+ - links.each do |link|
187
+ 111
188
+ -
189
+ 112
190
+ - # Remove protocol prefix
191
+ 113
192
+ - to_remove = ["http://", "https://"]
193
+ 114
194
+ - reg = Regexp.new(to_remove.map{ |s| "(#{s})" }.join('|'))
195
+ 115
196
+ - link = link['href'].gsub(reg, '')
197
+ 116 68
198
+
199
+ 117
200
+ - # If link start with '/url?q=', remove it
201
+ 118
202
+ - if link[0..6] == '/url?q='
203
+ 119
204
+ - link = link[7..-1]
205
+ 120
206
+ - end
207
+ 121
208
+ -
209
+ 122
210
+ - # If link starts with 'www.', remove it
211
+ 123
212
+ - if link[0..3] == 'www.'
213
+ 124
214
+ - link = link[4..-1]
215
+ 125
216
+ - end
217
+ 126
218
+ -
219
+ 127
220
+ - log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
221
+ 69
222
+ + def scrape=end
@@ -1,3 +1,3 @@
1
1
  module Crawlfish
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crawlfish
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 25
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 3
10
+ version: 0.0.3
11
11
  platform: ruby
12
12
  authors:
13
13
  - Dan Neumann