sengi 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +71 -0
- data/Makefile +23 -0
- data/Makefile.common +58 -0
- data/README.md +59 -0
- data/Rakefile +25 -0
- data/bin/config +148 -0
- data/bin/crawler +64 -0
- data/bin/list +129 -0
- data/bin/redis_start +11 -0
- data/bin/redis_stats +13 -0
- data/bin/redis_stop +10 -0
- data/bin/resque_crawler_restart +14 -0
- data/bin/resque_crawler_start +21 -0
- data/bin/resque_crawler_stop +20 -0
- data/bin/resque_scheduler_start +15 -0
- data/bin/resque_scheduler_stop +16 -0
- data/bin/resque_server_start +13 -0
- data/bin/resque_server_stop +13 -0
- data/config/redis.conf +120 -0
- data/config/resque_server_config.rb +6 -0
- data/lib/sengi.rb +5 -0
- data/lib/sengi/crawler.rb +589 -0
- data/lib/sengi/crawler_worker.rb +16 -0
- data/lib/sengi/uri.rb +288 -0
- data/lib/sengi/version.rb +17 -0
- data/sengi.gemspec +37 -0
- data/sengi.sublime-project +10 -0
- data/tests/tc_crawler.rb +14 -0
- data/tests/tc_uri.rb +140 -0
- data/tests/ts_all.rb +4 -0
- metadata +202 -0
data/lib/sengi/uri.rb
ADDED
@@ -0,0 +1,288 @@
|
|
1
|
+
|
2
|
+
require 'uri'
|
3
|
+
require 'digest'
|
4
|
+
|
5
|
+
module TheFox
  module Sengi
    # Normalised wrapper around a Ruby URI plus the bookkeeping values
    # (ids, key names, digests) that are derived from it.
    #
    # On construction the URL is parsed, validated and normalised:
    # * `javascript:` and `tel:` URLs, and every URI class not listed in
    #   URI_CLASSES, make the instance invalid (see #is_valid?);
    # * HTTP(S) URLs with a host but an empty path and no query get "/" as path;
    # * the host of non-generic URIs is downcased;
    # * the fragment is removed.
    #
    # The SHA-256 hex digest of the normalised string is available via #to_hash.
    class Uri
      # URI classes accepted as valid; any other parsed class is rejected.
      URI_CLASSES = [URI::Generic, URI::HTTP, URI::HTTPS].freeze

      # Lower-case URL prefixes that parse as URI::Generic but are rejected.
      REJECTED_PREFIXES = ['javascript:', 'tel:'].freeze

      # Plain flags/values owned by the caller; this class only stores them.
      # NOTE(review): is_ignored_reason defaults to the *string* 'nil', not nil.
      attr_accessor :is_blacklisted, :is_ignored, :is_ignored_reason

      # Values derived in #initialize or by the id setters below.
      attr_reader :hash_id_key_name, :id, :key_name,
                  :domain_nowww, :domain_nowww_hash, :domain_original_hash,
                  :domain_hash_id_key_name, :domain_id, :domain_key_name,
                  :request_id, :request_key_name,
                  :response_id, :response_key_name,
                  :response_size, :response_content_type

      # @param url [String, URI, nil] the URL to wrap. Anything URI() cannot
      #   parse results in an invalid instance instead of an exception.
      def initialize(url)
        @uri = nil
        @uri_class = nil
        @hash = nil
        @is_blacklisted = nil
        @is_ignored = nil
        @is_ignored_reason = 'nil'
        @hash_id_key_name = nil
        @id = nil
        @key_name = nil
        @domain_nowww = nil
        @domain_nowww_hash = nil
        @domain_original_hash = nil
        @domain_hash_id_key_name = nil
        @domain_id = nil
        @domain_key_name = nil
        @request_id = nil
        @request_key_name = nil
        @response_id = nil
        @response_key_name = nil
        @response_size = 0
        @response_content_type = ''

        begin
          # dup: URI() returns a passed-in URI object itself, and the
          # normalisation below must not modify the caller's object.
          @uri = URI(url).dup
        rescue StandardError
          # Unparsable input is an expected case; it simply yields an invalid
          # instance. StandardError (not Exception) keeps signals and exits intact.
          @uri = nil
        end

        validate
        return unless is_valid?

        append_slash
        host_downcase
        remove_fragment
        domain_setup

        @uri_class = @uri.class
        @hash = Digest::SHA256.hexdigest(to_s)
        @hash_id_key_name = "urls:id:#{@hash}"
      end

      # @return [Boolean] true when the URL was parsed and accepted.
      def is_valid?
        !@uri.nil?
      end

      # @return [URI::Generic, nil] the wrapped Ruby URI, nil when invalid.
      def ruri
        @uri
      end

      # Stores the id and derives #key_name ("urls:<id>") from it.
      def id=(id)
        @id = id
        @key_name = "urls:#{@id}"
      end

      # Stores the domain id and derives #domain_key_name ("domains:<id>").
      def domain_id=(domain_id)
        @domain_id = domain_id
        @domain_key_name = "domains:#{@domain_id}"
      end

      # Stores the request id and derives #request_key_name ("requests:<id>").
      def request_id=(request_id)
        @request_id = request_id
        @request_key_name = "requests:#{@request_id}"
      end

      # Stores the response id and derives #response_key_name ("responses:<id>").
      def response_id=(response_id)
        @response_id = response_id
        @response_key_name = "responses:#{@response_id}"
      end

      # Stores the size converted with #to_s.
      # NOTE(review): the value before the first assignment is the Integer 0,
      # afterwards it is a String; kept as is because callers may rely on it.
      def response_size=(response_size)
        @response_size = response_size.to_s
      end

      # Stores the content type converted with #to_s.
      def response_content_type=(response_content_type)
        @response_content_type = response_content_type.to_s
      end

      # @return [String] the normalised URL, or '' when invalid.
      def to_s
        @uri.to_s
      end

      # @return [String, nil] SHA-256 hex digest of #to_s, nil when invalid.
      def to_hash
        @hash
      end

      # @return [URI::Generic, nil] a copy of the wrapped URI whose scheme is
      #   set to 'http'; nil when this instance is invalid.
      def to_http
        return nil unless is_valid?

        http_uri = @uri.clone
        http_uri.scheme = 'http'
        http_uri
      end

      # Ranking value for this URL.
      #
      # @param ref_uri [Uri, nil] URL the hosts are compared against.
      # @return [Integer] 100 for generic (scheme-less) URIs, 200 for HTTP URLs
      #   whose host equals, is a subdomain of, or is a parent domain of
      #   ref_uri's host, 250 for other HTTP URLs, 290 for HTTPS URLs and 999
      #   for everything else (including invalid instances).
      def weight(ref_uri = nil)
        if @uri_class == URI::Generic
          100
        elsif @uri_class == URI::HTTP
          related_host?(ref_uri) ? 200 : 250
        elsif @uri_class == URI::HTTPS
          290
        else
          999
        end
      end

      # Resolves suburi against this URL.
      #
      # @param suburi [Uri] the (possibly relative) URL to resolve.
      # @return [Uri] a new instance for the joined URL.
      # @raise whatever URI.join raises, e.g. when either side is invalid.
      def join(suburi)
        self.class.new(URI.join(@uri, suburi.ruri).to_s)
      end

      # @param uri [Uri, nil] URL to compare the host with.
      # @return [Boolean] true when this is a generic URI, or when uri is
      #   given and has the same host as this URL.
      def is_relative?(uri = nil)
        return true if @uri_class == URI::Generic
        return false if @uri.nil? || uri.nil? || uri.ruri.nil?

        uri.ruri.host == @uri.host
      end

      private

      # Invalidates the instance for rejected prefixes and URI classes.
      def validate
        return unless is_valid?

        lowered = to_s.downcase
        rejected = lowered.start_with?(*REJECTED_PREFIXES) ||
                   !URI_CLASSES.include?(@uri.class)
        @uri = nil if rejected
      end

      # Gives host-only HTTP(S) URLs the path "/", so "http://example.com"
      # and "http://example.com/" normalise to the same string.
      # The path is set directly (instead of appending "/" to the string) so
      # a trailing fragment cannot swallow the slash.
      def append_slash
        return unless @uri.is_a?(URI::HTTP) # URI::HTTPS is a subclass
        return unless @uri.opaque.nil? && !@uri.host.to_s.empty?
        return unless @uri.path.to_s.empty? && @uri.query.nil?

        @uri.path = '/'
      end

      # Downcases the host of non-generic URIs; URIs without a host are skipped.
      def host_downcase
        return if @uri.class == URI::Generic || @uri.host.nil?

        @uri.host = @uri.host.downcase
      end

      # Drops the fragment from the wrapped URI.
      def remove_fragment
        @uri.fragment = nil
      end

      # Derives the domain values (host without leading "www.", digests and
      # the "domains:id:<digest>" key name) when the URI has a host.
      def domain_setup
        return if @uri.nil? || @uri.host.nil?

        @domain_nowww = @uri.host.sub(/\Awww\./, '')
        @domain_nowww_hash = Digest::SHA256.hexdigest(@domain_nowww)
        @domain_original_hash = Digest::SHA256.hexdigest(@uri.host)
        @domain_hash_id_key_name = "domains:id:#{@domain_nowww_hash}"
      end

      # True when both hosts are present and one equals the other or ends
      # with ".<other>". Label-boundary matching avoids treating look-alike
      # hosts (e.g. "evilwww.example.com") as related.
      def related_host?(ref_uri)
        return false if @uri.nil? || ref_uri.nil? || ref_uri.ruri.nil?

        own = @uri.host.to_s
        other = ref_uri.ruri.host.to_s
        return false if own.empty? || other.empty?

        own == other || own.end_with?(".#{other}") || other.end_with?(".#{own}")
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
module TheFox
  module Sengi
    # Gem metadata. String constants are frozen so they cannot be modified
    # in place at runtime.
    VERSION = '0.1.0'.freeze
    DATE = '2016-05-07'.freeze
    HOMEPAGE = 'https://github.com/TheFox/sengi'.freeze

    # User-Agent header value. A browser string is used instead of the
    # project-specific one that is kept below for reference.
    #HTTP_USER_AGENT = "Sengi SearchENGIne/#{VERSION}"
    HTTP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.108 Safari/537.36'.freeze

    # Referer header value.
    HTTP_REFERER = 'https://www.google.com/'.freeze

    # Timing values; their consumers are not part of this file.
    # NOTE(review): presumably seconds - confirm against the crawler.
    URL_DELAY = 2
    URL_SEPARATE_DELAY = 5
    URL_RESCHEDULE = 300
  end
end
|
data/sengi.gemspec
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# coding: UTF-8
|
2
|
+
|
3
|
+
# Put lib/ on the load path so the version constants can be required
# without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'sengi/version'

Gem::Specification.new do |spec|
  spec.name          = 'sengi'
  spec.version       = TheFox::Sengi::VERSION
  spec.date          = TheFox::Sengi::DATE
  spec.author        = 'Christian Mayer'
  spec.email         = 'christian@fox21.at'

  spec.summary       = %q{Sengi Web Crawler}
  spec.description   = %q{A web crawler using Ruby and Redis.}
  spec.homepage      = TheFox::Sengi::HOMEPAGE
  spec.license       = 'GPL-3.0'

  # Package every tracked file except test-related directories.
  # "tests?" also covers this project's tests/ directory, which the plain
  # "test" alternative did not match.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(tests?|spec|features)/}) }
  spec.bindir        = 'bin'
  # No executables are installed, although bin/ is set as bindir.
  spec.executables   = []
  spec.require_paths = ['lib']
  spec.required_ruby_version = '>=2.1.0'

  spec.add_development_dependency 'minitest', '~>5.8'

  spec.add_dependency 'activesupport', '~>4.2'
  spec.add_dependency 'redis', '~>3.2'
  spec.add_dependency 'hiredis', '~>0.6'
  spec.add_dependency 'resque', '~>1.26'
  spec.add_dependency 'resque-scheduler', '~>4.1'
  spec.add_dependency 'nokogiri', '~>1.6'
  spec.add_dependency 'cookiejar', '~>0.3'

  spec.add_dependency 'thefox-ext', '~>1.4'
end
|
data/tests/tc_crawler.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'sengi'
|
5
|
+
|
6
|
+
|
7
|
+
# Smoke test for TheFox::Sengi::Crawler.
# Uses Minitest::Test: the legacy "MiniTest" constant is not defined by
# default in current minitest 5.x releases.
class TestCrawler < Minitest::Test
  # The crawler can be constructed with placeholder arguments.
  def test_base
    crawler = TheFox::Sengi::Crawler.new(nil, 0, 0)

    assert_instance_of(TheFox::Sengi::Crawler, crawler)
  end
end
|
14
|
+
|
data/tests/tc_uri.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'minitest/autorun'
|
4
|
+
require 'sengi'
|
5
|
+
|
6
|
+
|
7
|
+
# Tests for TheFox::Sengi::Uri.
# Uses Minitest::Test: the legacy "MiniTest" constant is not defined by
# default in current minitest 5.x releases.
class TestUri < Minitest::Test
  # Wrapper class and wrapped Ruby URI class.
  def test_base
    uri = TheFox::Sengi::Uri.new('http://example.com')

    assert_equal('TheFox::Sengi::Uri', uri.class.to_s)
    assert_equal('URI::HTTP', uri.ruri.class.to_s)
  end

  # String conversion, including the trailing slash for host-only URLs.
  def test_string
    uri = TheFox::Sengi::Uri.new('http://example.com')
    assert_equal('http://example.com/', "#{uri}")
    assert_equal('http://example.com/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/')
    assert_equal('http://example.com/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir1/')
    assert_equal('http://example.com/subdir1/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir2')
    assert_equal('http://example.com/subdir2', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir2.html')
    assert_equal('http://example.com/subdir2.html', uri.to_s)

    uri = TheFox::Sengi::Uri.new('/subdir2.html')
    assert_equal('/subdir2.html', uri.to_s)
  end

  # SHA-256 hex digest of the normalised URL.
  def test_hash
    uri = TheFox::Sengi::Uri.new('http://www.example.com/index.html')

    assert_equal('b1ae8ba07f44d280254af4d1db914de03ce87b027e1c291ffcb9211c7712c9d1', uri.to_hash)
  end

  # Assigning an id also updates the derived key name.
  def test_id
    uri = TheFox::Sengi::Uri.new('http://www.example.com/index.html')

    uri.id = 21
    assert_equal(21, uri.id)
    assert_equal('urls:21', uri.key_name)

    uri.id = 24
    assert_equal(24, uri.id)
    assert_equal('urls:24', uri.key_name)
  end

  # javascript: and tel: URLs are rejected.
  def test_valid
    uri = TheFox::Sengi::Uri.new('http://example.com')
    assert_equal(true, uri.is_valid?)

    uri = TheFox::Sengi::Uri.new('javascript:alert(1);')
    assert_equal(false, uri.is_valid?)

    uri = TheFox::Sengi::Uri.new('tel:+43501234567890')
    assert_equal(false, uri.is_valid?)
  end

  # HTTP and HTTPS variants of a URL map to the same http URL string.
  def test_to_http
    uri_http = TheFox::Sengi::Uri.new('http://www.example.com/index.html')
    uri_https = TheFox::Sengi::Uri.new('https://www.example.com/index.html')
    assert_equal(uri_http.to_http.to_s, uri_https.to_http.to_s)
  end

  # Weights for relative, subdomain, foreign HTTP and HTTPS URLs.
  def test_weight
    uri = TheFox::Sengi::Uri.new('http://www.example1.com/index.html')

    suburi = TheFox::Sengi::Uri.new('test.html')
    assert_equal(100, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://sub.www.example1.com')
    assert_equal(200, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://sub.example1.com')
    assert_equal(250, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://www.example2.com')
    assert_equal(250, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('https://www.example2.com')
    assert_equal(290, suburi.weight(uri))
  end

  # Joining leaves both operands untouched and resolves the reference.
  def test_join
    uri1 = TheFox::Sengi::Uri.new('http://www.example.com')
    uri2 = TheFox::Sengi::Uri.new('index.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/', uri1.to_s)
    assert_equal('index.html', uri2.to_s)
    assert_equal('http://www.example.com/index.html', uri3.to_s)

    uri1 = TheFox::Sengi::Uri.new('http://www.example.com/test1')
    uri2 = TheFox::Sengi::Uri.new('../test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/test1', uri1.to_s)
    assert_equal('../test2.html', uri2.to_s)
    assert_equal('http://www.example.com/test2.html', uri3.to_s)

    uri1 = TheFox::Sengi::Uri.new('http://www.example1.com/test1.html')
    uri2 = TheFox::Sengi::Uri.new('http://www.example2.com/test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example1.com/test1.html', uri1.to_s)
    assert_equal('http://www.example2.com/test2.html', uri2.to_s)
    assert_equal('http://www.example2.com/test2.html', uri3.to_s)
  end

  # Generic URIs are relative; absolute URLs on another host are not.
  def test_is_relative
    uri1 = TheFox::Sengi::Uri.new('index1.html')
    uri2 = TheFox::Sengi::Uri.new('index2.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = TheFox::Sengi::Uri.new('http://www.example.com')
    uri2 = TheFox::Sengi::Uri.new('index.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = TheFox::Sengi::Uri.new('http://www.example1.com/index.html')
    uri2 = TheFox::Sengi::Uri.new('http://www.example2.com/index.html')
    assert_equal(false, uri2.is_relative?(uri1))
  end

  # Only the host is downcased, the path keeps its case.
  def test_host_downcase
    uri = TheFox::Sengi::Uri.new('http://www.EXAMPLE.com/Index.html')

    assert_equal('http://www.example.com/Index.html', uri.to_s)
  end

  # Fragments are stripped from absolute and relative URLs.
  def test_fragment
    uri = TheFox::Sengi::Uri.new('http://example.com/index.html#test')
    assert_equal('http://example.com/index.html', uri.to_s)

    uri = TheFox::Sengi::Uri.new('index.html#test')
    assert_equal('index.html', uri.to_s)
  end
end
|