sengi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+
2
module TheFox
  module Sengi

    # Background job wrapper around Crawler.
    #
    # Exposes the class-level +perform+ entry point plus a class-level
    # +@queue+ name (presumably the Resque job convention -- confirm against
    # the code that enqueues this worker).
    class CrawlerWorker
      # Queue name this job is filed under.
      @queue = :crawler

      class << self
        # Builds a Crawler for +url+ with +options+ and runs it.
        #
        # Returns whatever Crawler#go returns.
        def perform(url, options)
          Crawler.new(url, options).go
        end
      end
    end

  end
end
@@ -0,0 +1,288 @@
1
+
2
+ require 'uri'
3
+ require 'digest'
4
+
5
module TheFox
  module Sengi

    # Normalizing wrapper around a Ruby URI object.
    #
    # On construction the URL is parsed, validated and normalized (root path
    # gets a trailing slash for plain HTTP, host is lower-cased, fragment is
    # dropped). A SHA-256 of the normalized string and several storage key
    # names ("urls:...", "domains:...", "requests:...", "responses:...") are
    # derived from it. Check #is_valid? before using the other methods: most
    # of them dereference the wrapped URI and will raise on an invalid one.
    class Uri

      # URI classes that are accepted; any other parsed class is treated as invalid.
      URI_CLASSES = [URI::Generic, URI::HTTP, URI::HTTPS]

      # Plain state flags set by the caller.
      attr_accessor :is_blacklisted, :is_ignored, :is_ignored_reason

      # Derived values; the matching key names are only updated through the
      # id writers below (or computed once in the constructor).
      attr_reader :hash_id_key_name, :id, :key_name
      attr_reader :domain_nowww, :domain_nowww_hash, :domain_original_hash
      attr_reader :domain_hash_id_key_name, :domain_id, :domain_key_name
      attr_reader :request_id, :request_key_name
      attr_reader :response_id, :response_key_name
      attr_reader :response_size, :response_content_type

      # Parses and normalizes +url+.
      #
      # url - String (or URI object) to wrap. Anything URI() cannot parse,
      #       "javascript:" / "tel:" URLs and unsupported URI classes leave
      #       the instance invalid instead of raising.
      def initialize(url)
        @uri = nil
        @uri_class = nil
        @hash = nil
        @is_blacklisted = nil
        @is_ignored = nil
        # NOTE(review): this is the String 'nil', not nil -- kept as in the
        # original; confirm whether consumers rely on the literal.
        @is_ignored_reason = 'nil'
        @hash_id_key_name = nil
        @id = nil
        @key_name = nil
        @domain_nowww = nil
        @domain_nowww_hash = nil
        @domain_original_hash = nil
        @domain_hash_id_key_name = nil
        @domain_id = nil
        @domain_key_name = nil
        @request_id = nil
        @request_key_name = nil
        @response_id = nil
        @response_key_name = nil
        @response_size = 0
        @response_content_type = ''

        begin
          @uri = URI(url)
        rescue StandardError
          # Parse failures (URI::InvalidURIError, ArgumentError for non-strings)
          # mark the instance invalid. Signals and SystemExit are deliberately
          # no longer swallowed here.
          @uri = nil
        end

        validate
        return unless is_valid?

        append_slash
        host_downcase
        remove_fragment
        domain_setup

        @uri_class = @uri.class
        @hash = Digest::SHA256.hexdigest(to_s)
        @hash_id_key_name = "urls:id:#{@hash}"
      end

      # True when the URL was parsed and passed validation.
      def is_valid?
        !@uri.nil?
      end

      # The wrapped Ruby URI object (nil when invalid).
      def ruri
        @uri
      end

      # Sets the URL id and the derived "urls:<id>" key name.
      def id=(id)
        @id = id
        @key_name = "urls:#{@id}"
      end

      # Sets the domain id and the derived "domains:<id>" key name.
      def domain_id=(domain_id)
        @domain_id = domain_id
        @domain_key_name = "domains:#{@domain_id}"
      end

      # Sets the request id and the derived "requests:<id>" key name.
      def request_id=(request_id)
        @request_id = request_id
        @request_key_name = "requests:#{@request_id}"
      end

      # Sets the response id and the derived "responses:<id>" key name.
      def response_id=(response_id)
        @response_id = response_id
        @response_key_name = "responses:#{@response_id}"
      end

      # Stores the response size as a String (the initial value is Integer 0).
      def response_size=(response_size)
        @response_size = response_size.to_s
      end

      # Stores the response content type as a String.
      def response_content_type=(response_content_type)
        @response_content_type = response_content_type.to_s
      end

      # Normalized URL string; empty String when invalid.
      def to_s
        "#{@uri}"
      end

      # SHA-256 hex digest of the normalized URL (nil when invalid).
      def to_hash
        @hash
      end

      # Returns a copy of the wrapped Ruby URI with its scheme set to 'http'.
      def to_http
        http_uri = @uri.clone
        http_uri.scheme = 'http'
        http_uri
      end

      # Numeric weight of this URI relative to +ref_uri+.
      #
      # ref_uri - optional Uri to compare hosts with.
      #
      # Returns 100 for URI::Generic, 200 for HTTP when one host contains the
      # other as a substring, 250 for any other HTTP, 290 for HTTPS and 999
      # for everything else.
      def weight(ref_uri = nil)
        is_subdomain = false

        if !@uri.host.nil? && !ref_uri.nil? && !ref_uri.ruri.host.nil?
          own_host = @uri.host
          ref_host = ref_uri.ruri.host
          # Plain substring match in either direction, not a label-wise check.
          is_subdomain = own_host.include?(ref_host) || ref_host.include?(own_host)
        end

        if @uri_class == URI::Generic
          100
        elsif @uri_class == URI::HTTP
          is_subdomain ? 200 : 250
        elsif @uri_class == URI::HTTPS
          290
        else
          999
        end
      end

      # Resolves +suburi+ (a Uri) against this URI and wraps the result in a new Uri.
      def join(suburi)
        self.class.new(URI.join(@uri, suburi.ruri).to_s)
      end

      # True when this URI is a URI::Generic, or when +uri+ is given and has
      # the same host as this one.
      def is_relative?(uri = nil)
        @uri_class == URI::Generic ||
          (!uri.nil? && uri.ruri.host == @uri.host)
      end

      private

      # Invalidates "javascript:" / "tel:" URLs and unsupported URI classes.
      def validate
        if is_valid? && to_s.downcase.start_with?('javascript:', 'tel:')
          @uri = nil
        end

        if is_valid? && !URI_CLASSES.include?(@uri.class)
          @uri = nil
        end
      end

      # Gives a plain-HTTP root URL an explicit '/' path.
      #
      # The path is set directly instead of appending '/' to the whole string:
      # appending put the slash inside the fragment of e.g.
      # 'http://example.com#top', which then normalized without a slash.
      # NOTE(review): URI::HTTPS is not covered (exact class match), as in the
      # original -- confirm whether that is intended.
      def append_slash
        if @uri.class == URI::HTTP && @uri.request_uri == '/'
          @uri.path = '/'
        end
      end

      # Lower-cases the host of non-generic URIs. Host-less URIs such as
      # 'http:/foo' are skipped; they previously raised NoMethodError.
      def host_downcase
        return if @uri.class == URI::Generic || @uri.host.nil?

        @uri.host = @uri.host.downcase
      end

      # Drops the fragment part.
      def remove_fragment
        @uri.fragment = nil
      end

      # Derives the www-less domain, its hashes and the domain lookup key.
      def domain_setup
        return if @uri.nil? || @uri.host.nil?

        @domain_nowww = @uri.host.sub(/\Awww\./, '')
        @domain_nowww_hash = Digest::SHA256.hexdigest(@domain_nowww)
        @domain_original_hash = Digest::SHA256.hexdigest(@uri.host)
        @domain_hash_id_key_name = "domains:id:#{@domain_nowww_hash}"
      end
    end

  end
end
@@ -0,0 +1,17 @@
1
+
2
module TheFox
  module Sengi
    # Gem metadata. String constants are frozen so they cannot be mutated
    # in place by code that reads them.
    VERSION = '0.1.0'.freeze
    DATE = '2016-05-07'.freeze
    HOMEPAGE = 'https://github.com/TheFox/sengi'.freeze

    # User-Agent header value; a browser-style string is used instead of the
    # project-specific one kept below for reference.
    #HTTP_USER_AGENT = "Sengi SearchENGIne/#{VERSION}"
    HTTP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.108 Safari/537.36'.freeze

    # Referer header value.
    HTTP_REFERER = 'https://www.google.com/'.freeze

    # Timing settings. NOTE(review): units are presumably seconds -- confirm
    # against the crawler code that consumes them.
    URL_DELAY = 2
    URL_SEPARATE_DELAY = 5
    URL_RESCHEDULE = 300
  end
end
@@ -0,0 +1,37 @@
1
+ # coding: UTF-8
2
+
3
# Put the gem's own lib/ directory on the load path so that
# 'sengi/version' below resolves when the gemspec is evaluated.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

require 'sengi/version'

Gem::Specification.new do |gem|
  # Identity
  gem.name    = 'sengi'
  gem.version = TheFox::Sengi::VERSION
  gem.date    = TheFox::Sengi::DATE
  gem.author  = 'Christian Mayer'
  gem.email   = 'christian@fox21.at'

  # Description
  gem.summary     = 'Sengi Web Crawler'
  gem.description = 'A web crawler using Ruby and Redis.'
  gem.homepage    = TheFox::Sengi::HOMEPAGE
  gem.license     = 'GPL-3.0'

  # Packaged files: everything tracked by git except test/spec/features trees.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject do |path|
    path.match(%r{^(test|spec|features)/})
  end
  gem.bindir                = 'bin'
  gem.executables           = []
  gem.require_paths         = ['lib']
  gem.required_ruby_version = '>=2.1.0'

  # Development dependencies
  gem.add_development_dependency 'minitest', '~>5.8'

  # Runtime dependencies
  gem.add_dependency 'activesupport', '~>4.2'
  gem.add_dependency 'redis', '~>3.2'
  gem.add_dependency 'hiredis', '~>0.6'
  gem.add_dependency 'resque', '~>1.26'
  gem.add_dependency 'resque-scheduler', '~>4.1'
  gem.add_dependency 'nokogiri', '~>1.6'
  gem.add_dependency 'cookiejar', '~>0.3'

  gem.add_dependency 'thefox-ext', '~>1.4'
end
@@ -0,0 +1,10 @@
1
+ {
2
+ "folders":[
3
+ {
4
+ "path": ".",
5
+ "name": "Sengi",
6
+ "folder_exclude_patterns": [ ],
7
+ "file_exclude_patterns": [ ]
8
+ }
9
+ ]
10
+ }
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'minitest/autorun'
4
+ require 'sengi'
5
+
6
+
7
# Smoke test: a Crawler can be constructed.
#
# Inherits from Minitest::Test; the legacy MiniTest alias is not defined by
# default in newer minitest 5.x releases, which the gemspec's '~>5.8'
# constraint allows.
class TestCrawler < Minitest::Test
  def test_base
    crawler = TheFox::Sengi::Crawler.new(nil, 0, 0)

    assert_instance_of(TheFox::Sengi::Crawler, crawler)
  end
end
14
+
@@ -0,0 +1,140 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'minitest/autorun'
4
+ require 'sengi'
5
+
6
+
7
# Unit tests for TheFox::Sengi::Uri normalization, hashing, weighting and joining.
#
# Inherits from Minitest::Test; the legacy MiniTest alias is not defined by
# default in newer minitest 5.x releases, which the gemspec's '~>5.8'
# constraint allows.
class TestUri < Minitest::Test
  def test_base
    uri = build('http://example.com')

    assert_instance_of(TheFox::Sengi::Uri, uri)
    assert_instance_of(URI::HTTP, uri.ruri)
  end

  def test_string
    uri = build('http://example.com')
    assert_equal('http://example.com/', "#{uri}")
    assert_equal('http://example.com/', uri.to_s)

    # Each input paired with its expected normalized string.
    {
      'http://example.com/' => 'http://example.com/',
      'http://example.com/subdir1/' => 'http://example.com/subdir1/',
      'http://example.com/subdir2' => 'http://example.com/subdir2',
      'http://example.com/subdir2.html' => 'http://example.com/subdir2.html',
      '/subdir2.html' => '/subdir2.html',
    }.each do |input, expected|
      assert_equal(expected, build(input).to_s)
    end
  end

  def test_hash
    uri = build('http://www.example.com/index.html')

    assert_equal('b1ae8ba07f44d280254af4d1db914de03ce87b027e1c291ffcb9211c7712c9d1', uri.to_hash)
  end

  def test_id
    uri = build('http://www.example.com/index.html')

    uri.id = 21
    assert_equal(21, uri.id)
    assert_equal('urls:21', uri.key_name)

    # Re-assigning the id must refresh the derived key name.
    uri.id = 24
    assert_equal(24, uri.id)
    assert_equal('urls:24', uri.key_name)
  end

  def test_valid
    assert_equal(true, build('http://example.com').is_valid?)
    assert_equal(false, build('javascript:alert(1);').is_valid?)
    assert_equal(false, build('tel:+43501234567890').is_valid?)
  end

  def test_to_http
    uri_http = build('http://www.example.com/index.html')
    uri_https = build('https://www.example.com/index.html')

    assert_equal(uri_http.to_http.to_s, uri_https.to_http.to_s)
  end

  def test_weight
    uri = build('http://www.example1.com/index.html')

    assert_equal(100, build('test.html').weight(uri))
    assert_equal(200, build('http://sub.www.example1.com').weight(uri))
    assert_equal(250, build('http://sub.example1.com').weight(uri))
    assert_equal(250, build('http://www.example2.com').weight(uri))
    assert_equal(290, build('https://www.example2.com').weight(uri))
  end

  def test_join
    uri1 = build('http://www.example.com')
    uri2 = build('index.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/', uri1.to_s)
    assert_equal('index.html', uri2.to_s)
    assert_equal('http://www.example.com/index.html', uri3.to_s)

    uri1 = build('http://www.example.com/test1')
    uri2 = build('../test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example.com/test1', uri1.to_s)
    assert_equal('../test2.html', uri2.to_s)
    assert_equal('http://www.example.com/test2.html', uri3.to_s)

    # Joining an absolute URL yields that absolute URL.
    uri1 = build('http://www.example1.com/test1.html')
    uri2 = build('http://www.example2.com/test2.html')
    uri3 = uri1.join(uri2)
    assert_equal('http://www.example1.com/test1.html', uri1.to_s)
    assert_equal('http://www.example2.com/test2.html', uri2.to_s)
    assert_equal('http://www.example2.com/test2.html', uri3.to_s)
  end

  def test_is_relative
    uri1 = build('index1.html')
    uri2 = build('index2.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = build('http://www.example.com')
    uri2 = build('index.html')
    assert_equal(true, uri2.is_relative?(uri1))

    uri1 = build('http://www.example1.com/index.html')
    uri2 = build('http://www.example2.com/index.html')
    assert_equal(false, uri2.is_relative?(uri1))
  end

  def test_host_downcase
    uri = build('http://www.EXAMPLE.com/Index.html')

    # Only the host is lower-cased; the path keeps its case.
    assert_equal('http://www.example.com/Index.html', uri.to_s)
  end

  def test_fragment
    assert_equal('http://example.com/index.html', build('http://example.com/index.html#test').to_s)
    assert_equal('index.html', build('index.html#test').to_s)
  end

  private

  # Shorthand for constructing the class under test.
  def build(url)
    TheFox::Sengi::Uri.new(url)
  end
end