crawlfish 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.idea/workspace.xml +8 -8
- data/lib/crawlfish.rb +161 -1
- data/lib/crawlfish/version.rb +1 -1
- metadata +3 -3
data/.idea/workspace.xml
CHANGED
@@ -43,7 +43,7 @@
|
|
43
43
|
<file leaf-file-name="crawlfish.gemspec" pinned="false" current="false" current-in-tab="false">
|
44
44
|
<entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
|
45
45
|
<provider selected="true" editor-type-id="text-editor">
|
46
|
-
<state line="
|
46
|
+
<state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
|
47
47
|
<folding />
|
48
48
|
</state>
|
49
49
|
</provider>
|
@@ -70,7 +70,7 @@
|
|
70
70
|
<file leaf-file-name="crawlfish.rb" pinned="false" current="false" current-in-tab="false">
|
71
71
|
<entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
|
72
72
|
<provider selected="true" editor-type-id="text-editor">
|
73
|
-
<state line="
|
73
|
+
<state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
|
74
74
|
<folding />
|
75
75
|
</state>
|
76
76
|
</provider>
|
@@ -322,23 +322,23 @@
|
|
322
322
|
</state>
|
323
323
|
</provider>
|
324
324
|
</entry>
|
325
|
-
<entry file="file://$PROJECT_DIR$/
|
325
|
+
<entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
|
326
326
|
<provider selected="true" editor-type-id="text-editor">
|
327
|
-
<state line="
|
327
|
+
<state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
|
328
328
|
<folding />
|
329
329
|
</state>
|
330
330
|
</provider>
|
331
331
|
</entry>
|
332
|
-
<entry file="file://$PROJECT_DIR$/crawlfish.
|
332
|
+
<entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
|
333
333
|
<provider selected="true" editor-type-id="text-editor">
|
334
|
-
<state line="
|
334
|
+
<state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
|
335
335
|
<folding />
|
336
336
|
</state>
|
337
337
|
</provider>
|
338
338
|
</entry>
|
339
|
-
<entry file="file://$PROJECT_DIR$/
|
339
|
+
<entry file="file://$PROJECT_DIR$/Gemfile">
|
340
340
|
<provider selected="true" editor-type-id="text-editor">
|
341
|
-
<state line="
|
341
|
+
<state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
|
342
342
|
<folding />
|
343
343
|
</state>
|
344
344
|
</provider>
|
data/lib/crawlfish.rb
CHANGED
@@ -2,6 +2,7 @@ require 'nokogiri'
|
|
2
2
|
require 'open-uri'
|
3
3
|
|
4
4
|
module Crawlfish
|
5
|
+
LOL = "giggle"
|
5
6
|
class GoogleScraper
|
6
7
|
attr_accessor :website, :keyword, :user_agent, :start, :i, :position
|
7
8
|
def initialize(options)
|
@@ -59,4 +60,163 @@ module Crawlfish
|
|
59
60
|
@current_page += 1
|
60
61
|
end
|
61
62
|
end
|
62
|
-
end
|
63
|
+
end
|
64
|
+
=begin
|
65
|
+
|
66
|
+
module Lol
|
67
|
+
def scrape
|
68
|
+
require 'gscraper'
|
69
|
+
set_keyword
|
70
|
+
set_domain
|
71
|
+
set_user_agent
|
72
|
+
reset_pager
|
73
|
+
query_keyword
|
74
|
+
# Until position is found OR ten pages are searched, loop through ten pages
|
75
|
+
until @position or @pager_position > 10
|
76
|
+
search_this_page
|
77
|
+
sleep 2.seconds
|
78
|
+
next_page
|
79
|
+
end
|
80
|
+
@position ||= -1 # -1 if not found
|
81
|
+
log_the_end_of_the_scrape
|
82
|
+
return {:position => @position, :measured_at => Time.now, :engine => "Google"}
|
83
|
+
end
|
84
|
+
|
85
|
+
def create_logger
|
86
|
+
@log = Logger.new("#{Rails.root}/log/scraping.log")
|
87
|
+
end
|
88
|
+
def log(log_message)
|
89
|
+
@log.debug log_message
|
90
|
+
end
|
91
|
+
def query_keyword
|
92
|
+
@query = GScraper::Search.query(:query => @keyword)
|
93
|
+
end
|
94
|
+
def set_keyword
|
95
|
+
@keyword = self.text
|
96
|
+
end
|
97
|
+
def set_domain
|
98
|
+
@domain = self.website.url
|
99
|
+
end
|
100
|
+
def set_user_agent
|
101
|
+
@user_agent = get_random_user_agent
|
102
|
+
end
|
103
|
+
def reset_pager
|
104
|
+
@pager_position = 1
|
105
|
+
end
|
106
|
+
def search_this_page
|
107
|
+
log "page: #{@pager_position}"
|
108
|
+
begin
|
109
|
+
@query.page(@pager_position).each do |result|
|
110
|
+
host = URI::parse(URI::extract(result.url.to_s).first).host
|
111
|
+
log "#{result.rank}. Checking host: '#{host}' against domain: '#{@domain}'"
|
112
|
+
if host == @domain
|
113
|
+
@position = result.rank
|
114
|
+
end
|
115
|
+
end
|
116
|
+
rescue
|
117
|
+
log "rescued."
|
118
|
+
end
|
119
|
+
end
|
120
|
+
def next_page
|
121
|
+
@pager_position += 1
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# =========================================================================================================================
|
126
|
+
module lol2
|
127
|
+
def scrape
|
128
|
+
|
129
|
+
82
|
130
|
+
- until position or start==100 # until position gets set or reaches 10th page
|
131
|
+
83
|
132
|
+
- page = (start.to_i+10)/10
|
133
|
+
84
|
134
|
+
- log.debug "page: #{page}... "
|
135
|
+
85
|
136
|
+
- url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
|
137
|
+
86
|
138
|
+
- log.debug "url: #{url}"
|
139
|
+
87
|
140
|
+
- #url = Rails.root + "test/google-search/search#{page}.html"
|
141
|
+
88
|
142
|
+
- doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
|
143
|
+
89
|
144
|
+
- links = doc.xpath('//h3/a[contains(@class, "l")]')
|
145
|
+
90
|
146
|
+
- #links = doc.search('//h3/a[@class="l"]')
|
147
|
+
91
|
148
|
+
-
|
149
|
+
92
|
150
|
+
- #if links empty, try a more general scrape
|
151
|
+
93
|
152
|
+
- if links.empty?
|
153
|
+
94
|
154
|
+
- links = doc.xpath('//h3[@class = "r"]/a]')
|
155
|
+
95
|
156
|
+
- end
|
157
|
+
96 66
|
158
|
+
|
159
|
+
97
|
160
|
+
- if links.empty?
|
161
|
+
98
|
162
|
+
- if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
|
163
|
+
99
|
164
|
+
- # If no results returned,
|
165
|
+
100
|
166
|
+
- log.debug "Page contains no results"
|
167
|
+
101
|
168
|
+
- break
|
169
|
+
102
|
170
|
+
- else
|
171
|
+
103
|
172
|
+
- # No links, but also no "found no results" page. throw error
|
173
|
+
104
|
174
|
+
- log.debug doc.to_s
|
175
|
+
105
|
176
|
+
- log.debug "Raising: PageContainsNoLinks"
|
177
|
+
106
|
178
|
+
- raise "PageContainsNoLinks"
|
179
|
+
107
|
180
|
+
- end
|
181
|
+
108
|
182
|
+
- end
|
183
|
+
109 67
|
184
|
+
|
185
|
+
110
|
186
|
+
- links.each do |link|
|
187
|
+
111
|
188
|
+
-
|
189
|
+
112
|
190
|
+
- # Remove protocol prefix
|
191
|
+
113
|
192
|
+
- to_remove = ["http://", "https://"]
|
193
|
+
114
|
194
|
+
- reg = Regexp.new(to_remove.map{ |s| "(#{s})" }.join('|'))
|
195
|
+
115
|
196
|
+
- link = link['href'].gsub(reg, '')
|
197
|
+
116 68
|
198
|
+
|
199
|
+
117
|
200
|
+
- # If link start with '/url?q=', remove it
|
201
|
+
118
|
202
|
+
- if link[0..6] == '/url?q='
|
203
|
+
119
|
204
|
+
- link = link[7..-1]
|
205
|
+
120
|
206
|
+
- end
|
207
|
+
121
|
208
|
+
-
|
209
|
+
122
|
210
|
+
- # If link starts with 'www.', remove it
|
211
|
+
123
|
212
|
+
- if link[0..3] == 'www.'
|
213
|
+
124
|
214
|
+
- link = link[4..-1]
|
215
|
+
125
|
216
|
+
- end
|
217
|
+
126
|
218
|
+
-
|
219
|
+
127
|
220
|
+
- log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
|
221
|
+
69
|
222
|
+
+ def scrape=end
|
data/lib/crawlfish/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawlfish
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Dan Neumann
|