sengi 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,16 @@
1
+
2
module TheFox
  module Sengi

    # Background job that crawls one URL.
    #
    # Shaped like a Resque-style worker: the queue name is kept in the
    # class-level @queue variable and the job entry point is the class
    # method +perform+.
    class CrawlerWorker

      @queue = :crawler

      class << self

        # Builds a Crawler and runs it.
        #
        # url     - passed unchanged as the first argument of Crawler.new.
        # options - passed unchanged as the second argument of Crawler.new.
        #
        # Returns whatever Crawler#go returns.
        def perform(url, options)
          Crawler.new(url, options).go
        end

      end

    end

  end
end
@@ -0,0 +1,288 @@
1
+
2
+ require 'uri'
3
+ require 'digest'
4
+
5
module TheFox
  module Sengi

    # Normalizing wrapper around Ruby's URI objects, plus the bookkeeping
    # (ids, key names, hashes, response metadata) attached to one URL.
    #
    # On construction the URL is parsed, validated and normalized:
    # - `javascript:` / `tel:` links and URI classes outside URI_CLASSES make
    #   the instance invalid (see #is_valid?);
    # - an HTTP(S) URL with an empty path and no query gets the path "/";
    # - the host of non-generic URIs is downcased;
    # - the fragment is removed.
    #
    # Methods that read the wrapped URI (#to_http, #join, #weight with a
    # reference, #is_relative? with a reference) expect valid instances.
    #
    # NOTE(review): #to_hash returns a String (the SHA-256 hex digest of the
    # normalized URL). Ruby also uses #to_hash as the implicit Hash conversion
    # hook, so passing an instance where Ruby attempts that conversion may
    # raise a TypeError. The name is kept for backward compatibility.
    class Uri

      # URI classes accepted as valid; everything else (e.g. mailto) is rejected.
      URI_CLASSES = [URI::Generic, URI::HTTP, URI::HTTPS].freeze

      # Lower-cased URL prefixes that are never accepted, even though Ruby
      # parses them as URI::Generic.
      REJECTED_PREFIXES = ['javascript:', 'tel:'].freeze
      private_constant :REJECTED_PREFIXES

      # Flags set by the caller; all start out as nil except the reason,
      # which starts as the literal string 'nil'.
      attr_accessor :is_blacklisted, :is_ignored, :is_ignored_reason

      # Read-only views on values derived in the constructor or by the
      # id/size/content-type setters below.
      attr_reader :hash_id_key_name, :id, :key_name,
                  :domain_nowww, :domain_nowww_hash, :domain_original_hash,
                  :domain_hash_id_key_name, :domain_id, :domain_key_name,
                  :request_id, :request_key_name,
                  :response_id, :response_key_name,
                  :response_size, :response_content_type

      # Parses and normalizes +url+.
      #
      # url - a URL string (anything Kernel#URI accepts). Unparsable input
      #       does not raise; it yields an instance for which #is_valid? is
      #       false.
      def initialize(url)
        @uri = nil
        @uri_class = nil
        @hash = nil
        @is_blacklisted = nil
        @is_ignored = nil
        @is_ignored_reason = 'nil'
        @hash_id_key_name = nil
        @id = nil
        @key_name = nil
        @domain_nowww = nil
        @domain_nowww_hash = nil
        @domain_original_hash = nil
        @domain_hash_id_key_name = nil
        @domain_id = nil
        @domain_key_name = nil
        @request_id = nil
        @request_key_name = nil
        @response_id = nil
        @response_key_name = nil
        @response_size = 0
        @response_content_type = ''

        begin
          @uri = URI(url)
        rescue StandardError
          # Parse failures (URI::InvalidURIError, ArgumentError, ...) mark the
          # instance invalid. Signals and SystemExit are deliberately NOT
          # swallowed, unlike a `rescue Exception`.
          @uri = nil
        end

        validate
        return unless is_valid?

        append_slash
        host_downcase
        remove_fragment
        domain_setup

        @uri_class = @uri.class
        @hash = Digest::SHA256.hexdigest(to_s)
        @hash_id_key_name = "urls:id:#{@hash}"
      end

      # Returns true when the URL was parsed and accepted by validation.
      def is_valid?
        !@uri.nil?
      end

      # Returns the wrapped Ruby URI object, or nil for an invalid instance.
      def ruri
        @uri
      end

      # Sets the URL id and derives #key_name ("urls:<id>") from it.
      def id=(id)
        @id = id
        @key_name = "urls:#{@id}"
      end

      # Sets the domain id and derives #domain_key_name ("domains:<id>").
      def domain_id=(domain_id)
        @domain_id = domain_id
        @domain_key_name = "domains:#{@domain_id}"
      end

      # Sets the request id and derives #request_key_name ("requests:<id>").
      def request_id=(request_id)
        @request_id = request_id
        @request_key_name = "requests:#{@request_id}"
      end

      # Sets the response id and derives #response_key_name ("responses:<id>").
      def response_id=(response_id)
        @response_id = response_id
        @response_key_name = "responses:#{@response_id}"
      end

      # Stores the response size as a String (the initial value is Integer 0).
      def response_size=(response_size)
        @response_size = response_size.to_s
      end

      # Stores the response content type as a String.
      def response_content_type=(response_content_type)
        @response_content_type = response_content_type.to_s
      end

      # Returns the normalized URL, or '' for an invalid instance.
      def to_s
        @uri.to_s
      end

      # Returns the SHA-256 hex digest of the normalized URL (nil when invalid).
      def to_hash
        @hash
      end

      # Returns a copy of the wrapped URI whose scheme is set to 'http'.
      def to_http
        http_uri = @uri.clone
        http_uri.scheme = 'http'
        http_uri
      end

      # Ranks this URL by its URI class.
      #
      # ref_uri - optional Uri this one was found on.
      #
      # Returns 100 for generic (scheme-less/relative) URIs, 200 for an HTTP
      # URL whose host equals, is a subdomain of, or is a parent domain of the
      # reference host, 250 for any other HTTP URL, 290 for HTTPS and 999
      # otherwise (including invalid instances).
      def weight(ref_uri = nil)
        if @uri_class == URI::Generic
          100
        elsif @uri_class == URI::HTTP
          host_related_to?(ref_uri) ? 200 : 250
        elsif @uri_class == URI::HTTPS
          290
        else
          999
        end
      end

      # Resolves +suburi+ (a Uri) against this URL and returns a new Uri.
      def join(suburi)
        self.class.new(URI.join(@uri, suburi.ruri).to_s)
      end

      # Returns true for generic URIs, or when +uri+ is given and has the
      # same host as this URL.
      def is_relative?(uri = nil)
        @uri_class == URI::Generic ||
          (!uri.nil? && uri.ruri.host == @uri.host)
      end

      private

      # Invalidates the instance for rejected prefixes and unsupported
      # URI classes.
      def validate
        return unless is_valid?

        rejected_prefix = to_s.downcase.start_with?(*REJECTED_PREFIXES)
        @uri = nil if rejected_prefix || !URI_CLASSES.include?(@uri.class)
      end

      # Gives HTTP and HTTPS URLs without path and query the root path "/".
      #
      # Setting the path component (instead of appending "/" to the URL
      # string) keeps the slash out of a trailing fragment, so
      # "http://host#frag" normalizes to "http://host/" as well.
      def append_slash
        return unless @uri.is_a?(URI::HTTP) # URI::HTTPS is a subclass
        return unless @uri.path.to_s.empty? && @uri.query.nil?

        @uri.path = '/'
      end

      # Downcases the host of non-generic URIs. URIs without a host
      # (e.g. "http:") are left untouched instead of raising.
      def host_downcase
        return if @uri.class == URI::Generic || @uri.host.nil?

        @uri.host = @uri.host.downcase
      end

      # Drops the fragment ("#...") part.
      def remove_fragment
        @uri.fragment = nil
      end

      # Derives the domain values from the host, when there is one.
      def domain_setup
        return if @uri.nil? || @uri.host.nil?

        @domain_nowww = @uri.host.sub(/^www\./, '')
        @domain_nowww_hash = Digest::SHA256.hexdigest(@domain_nowww)
        @domain_original_hash = Digest::SHA256.hexdigest(@uri.host)
        @domain_hash_id_key_name = "domains:id:#{@domain_nowww_hash}"
      end

      # Returns true when this host and the reference host are equal or one
      # is a subdomain of the other. Comparison happens on label boundaries,
      # so "notexample.com" is not related to "example.com".
      def host_related_to?(ref_uri)
        return false if ref_uri.nil? || ref_uri.ruri.nil?

        own_host = @uri.host
        ref_host = ref_uri.ruri.host
        return false if own_host.nil? || ref_host.nil?

        own_host == ref_host ||
          own_host.end_with?(".#{ref_host}") ||
          ref_host.end_with?(".#{own_host}")
      end

    end

  end
end
@@ -0,0 +1,17 @@
1
+
2
module TheFox
  module Sengi
    # Gem metadata, consumed by the gemspec. String constants are frozen so
    # they cannot be mutated in place by accident.
    VERSION = '0.1.0'.freeze
    DATE = '2016-05-07'.freeze
    HOMEPAGE = 'https://github.com/TheFox/sengi'.freeze

    # HTTP header values. The self-identifying user agent is kept commented
    # out; the active value is a desktop Chrome user agent string.
    #HTTP_USER_AGENT = "Sengi SearchENGIne/#{VERSION}"
    HTTP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.108 Safari/537.36'.freeze

    HTTP_REFERER = 'https://www.google.com/'.freeze

    # Timing values. NOTE(review): presumably seconds, and presumably read by
    # the crawler/scheduler code; confirm against their usage.
    URL_DELAY = 2
    URL_SEPARATE_DELAY = 5
    URL_RESCHEDULE = 300
  end
end
@@ -0,0 +1,37 @@
1
+ # coding: UTF-8
2
+
3
# Make the gem's own lib/ directory loadable so the version file can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) if !$LOAD_PATH.include?(lib_dir)

require 'sengi/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name = 'sengi'
  gem.version = TheFox::Sengi::VERSION
  gem.date = TheFox::Sengi::DATE
  gem.author = 'Christian Mayer'
  gem.email = 'christian@fox21.at'

  # Description
  gem.summary = 'Sengi Web Crawler'
  gem.description = 'A web crawler using Ruby and Redis.'
  gem.homepage = TheFox::Sengi::HOMEPAGE
  gem.license = 'GPL-3.0'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path =~ %r{^(test|spec|features)/} }
  gem.bindir = 'bin'
  gem.executables = []
  gem.require_paths = ['lib']
  gem.required_ruby_version = '>=2.1.0'

  gem.add_development_dependency 'minitest', '~>5.8'

  # Runtime dependencies, registered in declaration order.
  {
    'activesupport' => '~>4.2',
    'redis' => '~>3.2',
    'hiredis' => '~>0.6',
    'resque' => '~>1.26',
    'resque-scheduler' => '~>4.1',
    'nokogiri' => '~>1.6',
    'cookiejar' => '~>0.3',
    'thefox-ext' => '~>1.4',
  }.each do |dependency, requirement|
    gem.add_dependency dependency, requirement
  end
end
@@ -0,0 +1,10 @@
1
+ {
2
+ "folders":[
3
+ {
4
+ "path": ".",
5
+ "name": "Sengi",
6
+ "folder_exclude_patterns": [ ],
7
+ "file_exclude_patterns": [ ]
8
+ }
9
+ ]
10
+ }
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'minitest/autorun'
4
+ require 'sengi'
5
+
6
+
7
# Smoke test for TheFox::Sengi::Crawler.
#
# Inherits from Minitest::Test: the legacy `MiniTest` namespace is no longer
# defined by default in newer minitest 5.x releases, which the gemspec's
# '~>5.8' requirement allows.
class TestCrawler < Minitest::Test
  # A crawler can be constructed and reports the expected class name.
  def test_base
    crawler = TheFox::Sengi::Crawler.new(nil, 0, 0)

    assert_equal('TheFox::Sengi::Crawler', crawler.class.to_s)
  end
end
14
+
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'minitest/autorun'
4
+ require 'sengi'
5
+
6
+
7
# Unit tests for TheFox::Sengi::Uri.
#
# Inherits from Minitest::Test: the legacy `MiniTest` namespace is no longer
# defined by default in newer minitest 5.x releases, which the gemspec's
# '~>5.8' requirement allows.
class TestUri < Minitest::Test
  # The wrapper and the wrapped Ruby URI have the expected classes.
  def test_base
    uri = TheFox::Sengi::Uri.new('http://example.com')

    assert_equal('TheFox::Sengi::Uri', uri.class.to_s)
    assert_equal('URI::HTTP', uri.ruri.class.to_s)
  end

  # String conversion: a bare HTTP host gets a trailing slash, paths are kept.
  def test_string
    uri = TheFox::Sengi::Uri.new('http://example.com')
    assert_equal('http://example.com/', "#{uri}")
    assert_equal('http://example.com/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/')
    assert_equal('http://example.com/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir1/')
    assert_equal('http://example.com/subdir1/', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir2')
    assert_equal('http://example.com/subdir2', uri.to_s)

    uri = TheFox::Sengi::Uri.new('http://example.com/subdir2.html')
    assert_equal('http://example.com/subdir2.html', uri.to_s)

    uri = TheFox::Sengi::Uri.new('/subdir2.html')
    assert_equal('/subdir2.html', uri.to_s)
  end

  # to_hash is the SHA-256 hex digest of the normalized URL.
  def test_hash
    uri = TheFox::Sengi::Uri.new('http://www.example.com/index.html')

    assert_equal('b1ae8ba07f44d280254af4d1db914de03ce87b027e1c291ffcb9211c7712c9d1', uri.to_hash)
  end

  # Assigning an id also updates the derived key name.
  def test_id
    uri = TheFox::Sengi::Uri.new('http://www.example.com/index.html')

    uri.id = 21
    assert_equal(21, uri.id)
    assert_equal('urls:21', uri.key_name)

    uri.id = 24
    assert_equal(24, uri.id)
    assert_equal('urls:24', uri.key_name)
  end

  # javascript: and tel: links are rejected.
  def test_valid
    uri = TheFox::Sengi::Uri.new('http://example.com')
    assert_equal(true, uri.is_valid?)

    uri = TheFox::Sengi::Uri.new('javascript:alert(1);')
    assert_equal(false, uri.is_valid?)

    uri = TheFox::Sengi::Uri.new('tel:+43501234567890')
    assert_equal(false, uri.is_valid?)
  end

  # HTTP and HTTPS variants of one URL map to the same HTTP URL.
  def test_to_http
    uri_http = TheFox::Sengi::Uri.new('http://www.example.com/index.html')
    uri_https = TheFox::Sengi::Uri.new('https://www.example.com/index.html')
    assert_equal(uri_http.to_http.to_s, uri_https.to_http.to_s)
  end

  # Weights per URI class and host relation to the reference URL.
  def test_weight
    uri = TheFox::Sengi::Uri.new('http://www.example1.com/index.html')

    suburi = TheFox::Sengi::Uri.new('test.html')
    assert_equal(100, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://sub.www.example1.com')
    assert_equal(200, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://sub.example1.com')
    assert_equal(250, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('http://www.example2.com')
    assert_equal(250, suburi.weight(uri))

    suburi = TheFox::Sengi::Uri.new('https://www.example2.com')
    assert_equal(290, suburi.weight(uri))
  end

  # join resolves relative and absolute URLs and leaves both operands untouched.
  def test_join
    uri1 = TheFox::Sengi::Uri.new('http://www.example.com')
    uri2 = TheFox::Sengi::Uri.new('index.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/', uri1.to_s)
    assert_equal('index.html', uri2.to_s)
    assert_equal('http://www.example.com/index.html', uri3.to_s)

    uri1 = TheFox::Sengi::Uri.new('http://www.example.com/test1')
    uri2 = TheFox::Sengi::Uri.new('../test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/test1', uri1.to_s)
    assert_equal('../test2.html', uri2.to_s)
    assert_equal('http://www.example.com/test2.html', uri3.to_s)

    uri1 = TheFox::Sengi::Uri.new('http://www.example1.com/test1.html')
    uri2 = TheFox::Sengi::Uri.new('http://www.example2.com/test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example1.com/test1.html', uri1.to_s)
    assert_equal('http://www.example2.com/test2.html', uri2.to_s)
    assert_equal('http://www.example2.com/test2.html', uri3.to_s)
  end

  # Scheme-less URLs are relative; absolute URLs on another host are not.
  def test_is_relative
    uri1 = TheFox::Sengi::Uri.new('index1.html')
    uri2 = TheFox::Sengi::Uri.new('index2.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = TheFox::Sengi::Uri.new('http://www.example.com')
    uri2 = TheFox::Sengi::Uri.new('index.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = TheFox::Sengi::Uri.new('http://www.example1.com/index.html')
    uri2 = TheFox::Sengi::Uri.new('http://www.example2.com/index.html')
    assert_equal(false, uri2.is_relative?(uri1))
  end

  # Only the host is downcased; the path keeps its case.
  def test_host_downcase
    uri = TheFox::Sengi::Uri.new('http://www.EXAMPLE.com/Index.html')

    assert_equal('http://www.example.com/Index.html', uri.to_s)
  end

  # Fragments are stripped from absolute and relative URLs.
  def test_fragment
    uri = TheFox::Sengi::Uri.new('http://example.com/index.html#test')
    assert_equal('http://example.com/index.html', uri.to_s)

    uri = TheFox::Sengi::Uri.new('index.html#test')
    assert_equal('index.html', uri.to_s)
  end
end