kudzu 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
module Kudzu
  # Thin wrapper around the standard library ::Logger. When no destination is
  # configured, every call to #log is a silent no-op.
  class Logger
    # @param file [::Logger, Object, nil] an existing ::Logger (used as-is, its
    #   level is left untouched), any log device accepted by ::Logger.new, or
    #   nil/false to disable logging
    # @param level [Object] value assigned to ::Logger#level= ; only applied
    #   when the logger is created here
    def initialize(file, level)
      @logger =
        if file.is_a?(::Logger)
          file
        elsif file
          ::Logger.new(file).tap { |logger| logger.level = level }
        end
    end

    # Writes +message+ at the given severity, optionally followed by the class,
    # message and backtrace of +error+.
    #
    # @param level [Symbol, String] name of the ::Logger method to call (e.g. :info)
    # @param message [#to_s] text to log
    # @param error [Exception, nil] exception whose details are appended
    # @return [Object, nil] result of the underlying logger call, nil when disabled
    def log(level, message, error: nil)
      return unless @logger

      message = "#{message} #{describe(error)}" if error
      # public_send keeps callers from reaching private ::Logger methods.
      @logger.public_send(level, message)
    end

    private

    # Renders "<class> <message> <backtrace>" for an exception. The backtrace
    # part is omitted when the exception has none (Exception#backtrace is nil
    # for an exception that was instantiated but never raised).
    def describe(error)
      parts = [error.class, error.message]
      backtrace = error.backtrace
      parts << backtrace.join("\n") if backtrace
      parts.join(' ')
    end
  end
end
@@ -0,0 +1,3 @@
1
# Load every Ruby file that sits next to this one. The list is sorted so the
# load order is the same on every platform (Dir[] does not guarantee an order
# on Ruby versions before 3.0).
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -0,0 +1,28 @@
1
module Kudzu
  class Revisit
    # Adjusts how often a page is revisited: the interval shrinks by one each
    # time the page was modified and grows by one each time it was not, always
    # staying inside the configured minimum/maximum.
    class Scheduler
      # Factor applied to revisit_interval before adding it to fetched_at
      # (the interval is presumably expressed in days and fetched_at is a Time).
      SECONDS_PER_DAY = 86_400

      # @param config [#revisit_min_interval, #revisit_max_interval, #revisit_default_interval]
      def initialize(config)
        @config = config
      end

      # Updates +page.revisit_interval+ and +page.revisit_at+ in place.
      #
      # @param page [#revisit_interval, #revisit_interval=, #revisit_at=, #fetched_at]
      # @param modified [Boolean] whether the page changed since the last fetch
      # @return [Object] the new revisit_at value
      def schedule(page, modified: true)
        page.revisit_interval = next_interval(page.revisit_interval, modified)
        page.revisit_at = page.fetched_at + page.revisit_interval * SECONDS_PER_DAY
      end

      private

      # Returns the default interval for a page that has none yet; otherwise
      # steps the current interval by one and clamps it on BOTH sides, so an
      # interval stored under an older, wider configuration is pulled back
      # into the current [min, max] range.
      def next_interval(curr_interval, modified)
        return @config.revisit_default_interval unless curr_interval

        stepped = modified ? curr_interval - 1 : curr_interval + 1
        [[stepped, @config.revisit_min_interval].max, @config.revisit_max_interval].min
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Load every Ruby file that sits next to this one. The list is sorted so the
# load order is the same on every platform (Dir[] does not guarantee an order
# on Ruby versions before 3.0).
Dir[File.join(__dir__, '*.rb')].sort.each do |file|
  require_relative file
end
@@ -0,0 +1,56 @@
1
module Kudzu
  class Util
    # Pool of named HTTP connections with least-recently-used eviction.
    # The pool itself is stored in Thread.current[:kudzu_connection], so it is
    # not shared between threads.
    class ConnectionPool
      # One pooled entry: the key it is stored under, the HTTP object, and the
      # time it was last checked out.
      class Connection
        attr_accessor :name, :http, :last_used

        # @param attr [Hash] attribute name => value, assigned through the writers
        def initialize(attr = {})
          attr.each { |key, value| public_send("#{key}=", value) }
        end
      end

      # @param max_size [Integer] number of connections kept before eviction starts
      def initialize(max_size = 10)
        @max_size = max_size
      end

      # Returns the HTTP object registered under +name+, building it from the
      # given block on first use. May evict older connections when the pool
      # grows past max_size; the connection being returned is never evicted.
      #
      # @param name [Object] pool key
      # @yieldreturn [Object] a new HTTP object responding to #started? and #finish
      # @return [Object] the pooled HTTP object
      def checkout(name)
        conn = (pool[name] ||= Connection.new(name: name, http: yield))
        conn.last_used = Time.now

        reduce(conn) if pool.size > @max_size

        conn.http
      end

      # Finishes every pooled connection and drops the current thread's pool.
      # The pool is dropped even when finishing a connection raises.
      #
      # @return [nil]
      def close
        pool.each_value { |conn| finish_http(conn.http) }
        nil
      ensure
        Thread.current[:kudzu_connection] = nil
      end

      private

      def pool
        Thread.current[:kudzu_connection] ||= {}
      end

      # Closes and removes the least recently used connections until the pool
      # is back at max_size. +in_use+ is skipped so the caller of #checkout
      # never receives a connection that was just finished.
      def reduce(in_use = nil)
        excess = pool.size - @max_size
        idle = pool.values.reject { |conn| conn.equal?(in_use) }
        idle.sort_by(&:last_used).first(excess).each do |conn|
          finish_http(conn.http)
          pool.delete(conn.name)
        end
      end

      def finish_http(http)
        http.finish if http && http.started?
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Kudzu
  class Util
    # Splits a Content-Type header value into its media type and parameters.
    class ContentTypeParser
      # The whole header is lower-cased before parsing, so both the media type
      # and the parameter values come back in lower case.
      #
      # @param content_type [#to_s, nil] raw header value, e.g. "Text/HTML; Charset=UTF-8"
      # @return [Array(String, Hash{Symbol => String})] media type (nil for an
      #   empty header) and parameters keyed by symbolized name
      def parse(content_type)
        mime, *pairs = content_type.to_s.split(';').map { |part| part.strip.downcase }
        params = pairs.each_with_object({}) do |pair, hash|
          # Limit of 2: only the first "=" separates name from value, so values
          # that themselves contain "=" are kept intact.
          key, value = pair.split('=', 2).map(&:strip)
          next if key.nil? || key.empty? || value.nil? || value.empty?

          hash[key.to_sym] = unquote(value)
        end
        [mime, params]
      end

      private

      # Strips surrounding double quotes and resolves backslash escapes.
      # Anchored with \A/\z so only a value that is quoted as a whole matches.
      def unquote(str)
        quoted = /\A"(.*?)"\z/.match(str)
        quoted ? quoted[1].gsub(/\\(.)/, '\1') : str
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Kudzu
  class Util
    # Decides whether a text passes a pair of allow/deny pattern lists.
    # Individual patterns are evaluated by Kudzu::Common.match?.
    class Matcher
      # @param text [Object] value handed to Kudzu::Common.match?
      # @param allows [Object, Array, nil] pattern(s) of which at least one must
      #   match; no patterns means everything is allowed
      # @param denies [Object, Array, nil] pattern(s) of which none may match
      # @return [Boolean]
      def match?(text, allows: nil, denies: nil)
        return false unless match_to_allows?(text, allows)

        !match_to_denies?(text, denies)
      end

      private

      # True when the allow list is empty or any of its patterns matches.
      def match_to_allows?(text, allows)
        patterns = Array(allows)
        return true if patterns.empty?

        patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end

      # True when the deny list is non-empty and any of its patterns matches.
      def match_to_denies?(text, denies)
        patterns = Array(denies)
        return false if patterns.empty?

        patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module Kudzu
  class Util
    # Fixed number of worker threads that share a single Queue.
    class ThreadPool
      # @param size [Integer] number of worker threads created by #start
      def initialize(size)
        @size = size
        @queue = Queue.new
        @threads = []
      end

      # Spawns the workers. Each worker calls the block with the shared queue
      # over and over until the block returns :end.
      #
      # @yieldparam queue [Queue] the queue shared by all workers
      # @return [Array<Thread>] the started threads
      def start(&block)
        @threads = 1.upto(@size).map { |_| create_thread(&block) }
      end

      # Blocks until every worker that is still alive is waiting on the queue,
      # re-checking roughly once per second.
      def wait
        loop do
          waiting = @queue.num_waiting
          break if waiting == @threads.count(&:alive?)

          Thread.pass
          sleep 1
        end
      end

      # Kills all workers and forgets them.
      def shutdown
        @threads.each(&:kill)
        @threads = []
      end

      private

      # Starts a single worker thread running the block in a loop.
      def create_thread(&block)
        Thread.start do
          loop { break if block.call(@queue) == :end }
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kudzu
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = '1.0.0'.freeze
end
metadata ADDED
@@ -0,0 +1,234 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kudzu
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Yoshikazu Kaneta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-12-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: charlock_holmes
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: shared-mime-info
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mime-types
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: http-cookie
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rails
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: pry-rails
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: pry-byebug
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: A simple web crawler for ruby
168
+ email:
169
+ - kaneta@sitebridge.co.jp
170
+ executables: []
171
+ extensions: []
172
+ extra_rdoc_files: []
173
+ files:
174
+ - README.md
175
+ - Rakefile
176
+ - lib/kudzu.rb
177
+ - lib/kudzu/adapter/base/all.rb
178
+ - lib/kudzu/adapter/base/link.rb
179
+ - lib/kudzu/adapter/base/page.rb
180
+ - lib/kudzu/adapter/memory.rb
181
+ - lib/kudzu/adapter/memory/all.rb
182
+ - lib/kudzu/adapter/memory/frontier.rb
183
+ - lib/kudzu/adapter/memory/model/link.rb
184
+ - lib/kudzu/adapter/memory/model/page.rb
185
+ - lib/kudzu/adapter/memory/repository.rb
186
+ - lib/kudzu/agent/all.rb
187
+ - lib/kudzu/agent/charset_detector.rb
188
+ - lib/kudzu/agent/fetcher.rb
189
+ - lib/kudzu/agent/filter.rb
190
+ - lib/kudzu/agent/mime_type_detector.rb
191
+ - lib/kudzu/agent/robots.rb
192
+ - lib/kudzu/agent/sleeper.rb
193
+ - lib/kudzu/agent/title_parser.rb
194
+ - lib/kudzu/agent/url_extractor.rb
195
+ - lib/kudzu/agent/url_filter.rb
196
+ - lib/kudzu/callback.rb
197
+ - lib/kudzu/common.rb
198
+ - lib/kudzu/config.rb
199
+ - lib/kudzu/config/filter.rb
200
+ - lib/kudzu/crawler.rb
201
+ - lib/kudzu/logger.rb
202
+ - lib/kudzu/revisit/all.rb
203
+ - lib/kudzu/revisit/scheduler.rb
204
+ - lib/kudzu/util/all.rb
205
+ - lib/kudzu/util/connection_pool.rb
206
+ - lib/kudzu/util/content_type_parser.rb
207
+ - lib/kudzu/util/matcher.rb
208
+ - lib/kudzu/util/thread_pool.rb
209
+ - lib/kudzu/version.rb
210
+ homepage: https://github.com/kanety/kudzu
211
+ licenses:
212
+ - MIT
213
+ metadata: {}
214
+ post_install_message:
215
+ rdoc_options: []
216
+ require_paths:
217
+ - lib
218
+ required_ruby_version: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
223
+ required_rubygems_version: !ruby/object:Gem::Requirement
224
+ requirements:
225
+ - - ">="
226
+ - !ruby/object:Gem::Version
227
+ version: '0'
228
+ requirements: []
229
+ rubyforge_project:
230
+ rubygems_version: 2.5.2.2
231
+ signing_key:
232
+ specification_version: 4
233
+ summary: A simple web crawler for ruby
234
+ test_files: []