crawlfish 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.idea/workspace.xml +8 -8
- data/lib/crawlfish.rb +161 -1
- data/lib/crawlfish/version.rb +1 -1
- metadata +3 -3
data/.idea/workspace.xml
CHANGED
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
<file leaf-file-name="crawlfish.gemspec" pinned="false" current="false" current-in-tab="false">
|
|
44
44
|
<entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
|
|
45
45
|
<provider selected="true" editor-type-id="text-editor">
|
|
46
|
-
<state line="
|
|
46
|
+
<state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
|
|
47
47
|
<folding />
|
|
48
48
|
</state>
|
|
49
49
|
</provider>
|
|
@@ -70,7 +70,7 @@
|
|
|
70
70
|
<file leaf-file-name="crawlfish.rb" pinned="false" current="false" current-in-tab="false">
|
|
71
71
|
<entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
|
|
72
72
|
<provider selected="true" editor-type-id="text-editor">
|
|
73
|
-
<state line="
|
|
73
|
+
<state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
|
|
74
74
|
<folding />
|
|
75
75
|
</state>
|
|
76
76
|
</provider>
|
|
@@ -322,23 +322,23 @@
|
|
|
322
322
|
</state>
|
|
323
323
|
</provider>
|
|
324
324
|
</entry>
|
|
325
|
-
<entry file="file://$PROJECT_DIR$/
|
|
325
|
+
<entry file="file://$PROJECT_DIR$/crawlfish.gemspec">
|
|
326
326
|
<provider selected="true" editor-type-id="text-editor">
|
|
327
|
-
<state line="
|
|
327
|
+
<state line="23" column="0" selection-start="785" selection-end="785" vertical-scroll-proportion="0.0">
|
|
328
328
|
<folding />
|
|
329
329
|
</state>
|
|
330
330
|
</provider>
|
|
331
331
|
</entry>
|
|
332
|
-
<entry file="file://$PROJECT_DIR$/crawlfish.
|
|
332
|
+
<entry file="file://$PROJECT_DIR$/lib/crawlfish.rb">
|
|
333
333
|
<provider selected="true" editor-type-id="text-editor">
|
|
334
|
-
<state line="
|
|
334
|
+
<state line="4" column="16" selection-start="72" selection-end="72" vertical-scroll-proportion="0.0">
|
|
335
335
|
<folding />
|
|
336
336
|
</state>
|
|
337
337
|
</provider>
|
|
338
338
|
</entry>
|
|
339
|
-
<entry file="file://$PROJECT_DIR$/
|
|
339
|
+
<entry file="file://$PROJECT_DIR$/Gemfile">
|
|
340
340
|
<provider selected="true" editor-type-id="text-editor">
|
|
341
|
-
<state line="
|
|
341
|
+
<state line="4" column="0" selection-start="93" selection-end="93" vertical-scroll-proportion="0.0">
|
|
342
342
|
<folding />
|
|
343
343
|
</state>
|
|
344
344
|
</provider>
|
data/lib/crawlfish.rb
CHANGED
|
@@ -2,6 +2,7 @@ require 'nokogiri'
|
|
|
2
2
|
require 'open-uri'
|
|
3
3
|
|
|
4
4
|
module Crawlfish
|
|
5
|
+
LOL = "giggle"
|
|
5
6
|
class GoogleScraper
|
|
6
7
|
attr_accessor :website, :keyword, :user_agent, :start, :i, :position
|
|
7
8
|
def initialize(options)
|
|
@@ -59,4 +60,163 @@ module Crawlfish
|
|
|
59
60
|
@current_page += 1
|
|
60
61
|
end
|
|
61
62
|
end
|
|
62
|
-
end
|
|
63
|
+
end
|
|
64
|
+
=begin
|
|
65
|
+
|
|
66
|
+
module Lol
|
|
67
|
+
def scrape
|
|
68
|
+
require 'gscraper'
|
|
69
|
+
set_keyword
|
|
70
|
+
set_domain
|
|
71
|
+
set_user_agent
|
|
72
|
+
reset_pager
|
|
73
|
+
query_keyword
|
|
74
|
+
# Until position is found OR ten pages are searched, loop through ten pages
|
|
75
|
+
until @position or @pager_position > 10
|
|
76
|
+
search_this_page
|
|
77
|
+
sleep 2.seconds
|
|
78
|
+
next_page
|
|
79
|
+
end
|
|
80
|
+
@position ||= -1 # -1 if not found
|
|
81
|
+
log_the_end_of_the_scrape
|
|
82
|
+
return {:position => @position, :measured_at => Time.now, :engine => "Google"}
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def create_logger
|
|
86
|
+
@log = Logger.new("#{Rails.root}/log/scraping.log")
|
|
87
|
+
end
|
|
88
|
+
def log(log_message)
|
|
89
|
+
@log.debug log_message
|
|
90
|
+
end
|
|
91
|
+
def query_keyword
|
|
92
|
+
@query = GScraper::Search.query(:query => @keyword)
|
|
93
|
+
end
|
|
94
|
+
def set_keyword
|
|
95
|
+
@keyword = self.text
|
|
96
|
+
end
|
|
97
|
+
def set_domain
|
|
98
|
+
@domain = self.website.url
|
|
99
|
+
end
|
|
100
|
+
def set_user_agent
|
|
101
|
+
@user_agent = get_random_user_agent
|
|
102
|
+
end
|
|
103
|
+
def reset_pager
|
|
104
|
+
@pager_position = 1
|
|
105
|
+
end
|
|
106
|
+
def search_this_page
|
|
107
|
+
log "page: #{@pager_position}"
|
|
108
|
+
begin
|
|
109
|
+
@query.page(@pager_position).each do |result|
|
|
110
|
+
host = URI::parse(URI::extract(result.url.to_s).first).host
|
|
111
|
+
log "#{result.rank}. Checking host: '#{host}' against domain: '#{@domain}'"
|
|
112
|
+
if host == @domain
|
|
113
|
+
@position = result.rank
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
rescue
|
|
117
|
+
log "rescued."
|
|
118
|
+
end
|
|
119
|
+
end
|
|
120
|
+
def next_page
|
|
121
|
+
@pager_position += 1
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
# =========================================================================================================================
|
|
126
|
+
module lol2
|
|
127
|
+
def scrape
|
|
128
|
+
|
|
129
|
+
82
|
|
130
|
+
- until position or start==100 # until position gets set or reaches 10th page
|
|
131
|
+
83
|
|
132
|
+
- page = (start.to_i+10)/10
|
|
133
|
+
84
|
|
134
|
+
- log.debug "page: #{page}... "
|
|
135
|
+
85
|
|
136
|
+
- url = "http://www.google.com/search?q=#{keyword}&start=#{start}"
|
|
137
|
+
86
|
|
138
|
+
- log.debug "url: #{url}"
|
|
139
|
+
87
|
|
140
|
+
- #url = Rails.root + "test/google-search/search#{page}.html"
|
|
141
|
+
88
|
|
142
|
+
- doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
|
|
143
|
+
89
|
|
144
|
+
- links = doc.xpath('//h3/a[contains(@class, "l")]')
|
|
145
|
+
90
|
|
146
|
+
- #links = doc.search('//h3/a[@class="l"]')
|
|
147
|
+
91
|
|
148
|
+
-
|
|
149
|
+
92
|
|
150
|
+
- #if links empty, try a more general scrape
|
|
151
|
+
93
|
|
152
|
+
- if links.empty?
|
|
153
|
+
94
|
|
154
|
+
- links = doc.xpath('//h3[@class = "r"]/a]')
|
|
155
|
+
95
|
|
156
|
+
- end
|
|
157
|
+
96 66
|
|
158
|
+
|
|
159
|
+
97
|
|
160
|
+
- if links.empty?
|
|
161
|
+
98
|
|
162
|
+
- if doc.at_xpath('//div/p').to_s =~ /did not match any documents/
|
|
163
|
+
99
|
|
164
|
+
- # If no results returned,
|
|
165
|
+
100
|
|
166
|
+
- log.debug "Page contains no results"
|
|
167
|
+
101
|
|
168
|
+
- break
|
|
169
|
+
102
|
|
170
|
+
- else
|
|
171
|
+
103
|
|
172
|
+
- # No links, but also no "found no results" page. throw error
|
|
173
|
+
104
|
|
174
|
+
- log.debug doc.to_s
|
|
175
|
+
105
|
|
176
|
+
- log.debug "Raising: PageContainsNoLinks"
|
|
177
|
+
106
|
|
178
|
+
- raise "PageContainsNoLinks"
|
|
179
|
+
107
|
|
180
|
+
- end
|
|
181
|
+
108
|
|
182
|
+
- end
|
|
183
|
+
109 67
|
|
184
|
+
|
|
185
|
+
110
|
|
186
|
+
- links.each do |link|
|
|
187
|
+
111
|
|
188
|
+
-
|
|
189
|
+
112
|
|
190
|
+
- # Remove protocol prefix
|
|
191
|
+
113
|
|
192
|
+
- to_remove = ["http://", "https://"]
|
|
193
|
+
114
|
|
194
|
+
- reg = Regexp.new(to_remove.map{ |s| "(#{s})" }.join('|'))
|
|
195
|
+
115
|
|
196
|
+
- link = link['href'].gsub(reg, '')
|
|
197
|
+
116 68
|
|
198
|
+
|
|
199
|
+
117
|
|
200
|
+
- # If link start with '/url?q=', remove it
|
|
201
|
+
118
|
|
202
|
+
- if link[0..6] == '/url?q='
|
|
203
|
+
119
|
|
204
|
+
- link = link[7..-1]
|
|
205
|
+
120
|
|
206
|
+
- end
|
|
207
|
+
121
|
|
208
|
+
-
|
|
209
|
+
122
|
|
210
|
+
- # If link starts with 'www.', remove it
|
|
211
|
+
123
|
|
212
|
+
- if link[0..3] == 'www.'
|
|
213
|
+
124
|
|
214
|
+
- link = link[4..-1]
|
|
215
|
+
125
|
|
216
|
+
- end
|
|
217
|
+
126
|
|
218
|
+
-
|
|
219
|
+
127
|
|
220
|
+
- log.debug "link #{i}. '#{link[0..domain.length-1]}' against '#{domain}'"
|
|
221
|
+
69
|
|
222
|
+
+ def scrape=end
|
data/lib/crawlfish/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: crawlfish
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 25
|
|
5
5
|
prerelease: false
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
8
|
- 0
|
|
9
|
-
-
|
|
10
|
-
version: 0.0.
|
|
9
|
+
- 3
|
|
10
|
+
version: 0.0.3
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Dan Neumann
|