kudzu 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
module Kudzu
  # Thin wrapper around the standard library ::Logger that tolerates having no
  # log destination at all and can append exception details to a message.
  class Logger
    # @param file [::Logger, Object, nil] an existing ::Logger to reuse as is,
    #   any destination accepted by ::Logger.new, or nil/false to disable logging
    # @param level [Object] severity handed to ::Logger#level=; applied only
    #   when this wrapper builds the ::Logger itself, so a ready-made ::Logger
    #   keeps whatever level it already has
    def initialize(file, level)
      @logger =
        if file.is_a?(::Logger)
          file
        elsif file
          ::Logger.new(file).tap { |logger| logger.level = level }
        end
    end

    # Writes +message+ at the given severity; does nothing when logging is
    # disabled.
    #
    # @param level [Symbol, String] name of the logger method to call
    #   (e.g. :info, :error)
    # @param message [String] text to log
    # @param error [Exception, nil] when given, its class, message and
    #   backtrace are appended to the message, separated by spaces
    # @return [Object, nil] whatever the underlying logger method returns, or
    #   nil when logging is disabled
    def log(level, message, error: nil)
      return unless @logger

      if error
        # Exception#backtrace is nil for an exception that was never raised;
        # Array() turns that into an empty trace instead of crashing on nil.join.
        trace = Array(error.backtrace).join("\n")
        message = "#{message} #{error.class} #{error.message} #{trace}"
      end

      # public_send keeps the dispatch limited to the logger's public API.
      @logger.public_send(level, message)
    end
  end
end
@@ -0,0 +1,3 @@
1
# Require every .rb file found in this file's own directory. The list is
# sorted explicitly because Dir[] only guarantees a sorted result on
# Ruby 3.0+, and load order should not depend on the filesystem.
Dir[File.join(__dir__, '*.rb')].sort.each { |path| require_relative path }
@@ -0,0 +1,28 @@
1
module Kudzu
  class Revisit
    # Decides how many days to wait before a page is fetched again, using the
    # interval limits read from the given config object.
    class Scheduler
      # @param config [#revisit_min_interval, #revisit_max_interval,
      #   #revisit_default_interval] source of the interval settings
      def initialize(config)
        @config = config
      end

      # Updates the page's revisit interval and its next revisit time.
      #
      # @param page [#revisit_interval, #revisit_interval=, #revisit_at=, #fetched_at]
      # @param modified [Boolean] whether the page content changed
      # @return [Object] the value assigned to page.revisit_at
      def schedule(page, modified: true)
        page.revisit_interval = next_interval(page.revisit_interval, modified)
        fetched = page.fetched_at
        # The interval is multiplied by 86400 (seconds in a day) before being
        # added to the fetch time.
        page.revisit_at = fetched + page.revisit_interval * 86400
      end

      private

      # A page without an interval gets the default; a modified page is
      # revisited one step sooner (not below the minimum), an unmodified page
      # one step later (not above the maximum).
      def next_interval(current, modified)
        return @config.revisit_default_interval unless current

        modified ? [current - 1, @config.revisit_min_interval].max : [current + 1, @config.revisit_max_interval].min
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Require every .rb file found in this file's own directory. The list is
# sorted explicitly because Dir[] only guarantees a sorted result on
# Ruby 3.0+, and load order should not depend on the filesystem.
Dir[File.join(__dir__, '*.rb')].sort.each { |path| require_relative path }
@@ -0,0 +1,56 @@
1
module Kudzu
  class Util
    # Bounded cache of named HTTP handles. The connections are stored under
    # Thread.current[:kudzu_connection] rather than in the instance, so the
    # set of pooled connections is looked up per calling thread.
    class ConnectionPool
      # One pooled entry: the key it is stored under, the HTTP handle, and the
      # time it was last checked out.
      class Connection
        attr_accessor :name, :http, :last_used

        # @param attr [Hash] attribute name => value, assigned through the
        #   public writers
        def initialize(attr = {})
          attr.each { |k, v| public_send("#{k}=", v) }
        end
      end

      # @param max_size [Integer] number of connections kept before the least
      #   recently used ones are closed
      def initialize(max_size = 10)
        @max_size = max_size
      end

      # Returns the HTTP handle registered under +name+, building it with the
      # given block the first time the name is seen.
      #
      # @param name [Object] pool key
      # @yieldreturn [Object] a new HTTP handle (presumably a started Net::HTTP
      #   session — it must respond to started? and finish)
      # @return [Object] the pooled HTTP handle
      def checkout(name)
        conn = (pool[name] ||= Connection.new(name: name, http: yield))
        conn.last_used = Time.now

        # Pass the connection being handed out so eviction can never close it,
        # even when timestamps tie or max_size is zero.
        reduce(conn) if pool.size > @max_size

        conn.http
      end

      # Finishes every pooled connection and drops the current thread's pool.
      # The pool is cleared even if finishing a connection raises.
      #
      # @return [nil]
      def close
        pool.each_value { |conn| finish_http(conn.http) }
        nil
      ensure
        Thread.current[:kudzu_connection] = nil
      end

      private

      # Hash of name => Connection for the current thread, created on demand.
      def pool
        Thread.current[:kudzu_connection] ||= {}
      end

      # Closes and removes the least recently used connections until the pool
      # is back within max_size, never touching +keep+.
      def reduce(keep = nil)
        excess = pool.size - @max_size
        candidates = pool.values.reject { |conn| conn.equal?(keep) }
        candidates.sort_by(&:last_used).first(excess).each do |conn|
          finish_http(conn.http)
          pool.delete(conn.name)
        end
      end

      # Finishes the handle only when it exists and reports itself as started.
      def finish_http(http)
        http.finish if http && http.started?
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Kudzu
  class Util
    # Splits a Content-Type style header value into a media type and a hash of
    # its parameters.
    class ContentTypeParser
      # A value wrapped in double quotes, anchored to the whole string.
      QUOTED_VALUE = /\A"(.*)"\z/

      # Parses e.g. 'Text/HTML; Charset="UTF-8"' into
      # ['text/html', { charset: 'utf-8' }].
      #
      # The whole input is lowercased, parameter values included. The input is
      # split on ';' before quotes are considered, so a quoted value that
      # itself contains ';' is still cut at that point.
      #
      # @param content_type [#to_s] raw header value; nil is treated as empty
      # @return [Array(String, Hash)] the media type (nil when the input is
      #   empty) and the parameters keyed by Symbol
      def parse(content_type)
        mime, *pairs = content_type.to_s.split(';').map { |part| part.strip.downcase }
        params = pairs.each_with_object({}) do |pair, hash|
          # Limit the split to the first '=' so values such as boundary="a=b"
          # are kept whole instead of being truncated.
          key, value = pair.split('=', 2).map(&:strip)
          # Parameters without a value ("charset" or "charset=") are ignored.
          next if key.nil? || value.nil? || value.empty?

          hash[key.to_sym] = unquote(value)
        end
        [mime, params]
      end

      private

      # Strips surrounding double quotes and resolves backslash escapes;
      # unquoted input is returned unchanged.
      def unquote(str)
        quoted = QUOTED_VALUE.match(str)
        quoted ? quoted[1].gsub(/\\(.)/, '\1') : str
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Kudzu
  class Util
    # Checks a text against allow and deny pattern lists, delegating each
    # single comparison to Kudzu::Common.match?.
    class Matcher
      # @param text [Object] value passed on to Kudzu::Common.match?
      # @param allows [Object, Array, nil] pattern(s) of which one must match;
      #   nil or an empty list accepts everything
      # @param denies [Object, Array, nil] pattern(s) of which none may match
      # @return [Boolean] true when the text is allowed and not denied
      def match?(text, allows: nil, denies: nil)
        return false unless match_to_allows?(text, allows)

        !match_to_denies?(text, denies)
      end

      private

      # An empty allow list accepts every text.
      def match_to_allows?(text, allows)
        patterns = Array(allows)
        return true if patterns.empty?

        patterns.any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end

      # An empty deny list rejects nothing (any? is false on an empty list).
      def match_to_denies?(text, denies)
        Array(denies).any? { |pattern| Kudzu::Common.match?(text, pattern) }
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module Kudzu
  class Util
    # Fixed-size group of worker threads that all run the same block against
    # one shared Queue.
    class ThreadPool
      # @param size [Integer] number of worker threads started by #start
      def initialize(size)
        @size = size
        @queue = Queue.new
        @threads = []
      end

      # Starts the workers. Each one calls the block with the shared queue
      # over and over until the block returns :end.
      #
      # @yieldparam queue [Queue] the pool's shared queue
      # @return [Array<Thread>] the started threads
      def start(&block)
        @threads = (1..@size).map { create_thread(&block) }
      end

      # Blocks until the number of threads waiting on the queue equals the
      # number of live workers, checking once per second.
      def wait
        loop do
          break if @queue.num_waiting == @threads.count(&:alive?)

          Thread.pass
          sleep 1
        end
      end

      # Kills every worker and forgets about them.
      def shutdown
        @threads.each(&:kill)
        @threads = []
      end

      private

      # Spawns one worker that keeps invoking the block until it answers :end.
      def create_thread(&block)
        Thread.start do
          loop { break if block.call(@queue) == :end }
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Kudzu
  # Version of the kudzu gem. Frozen so the shared string cannot be mutated
  # in place by code that reads it.
  VERSION = '1.0.0'.freeze
end
metadata ADDED
@@ -0,0 +1,234 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kudzu
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Yoshikazu Kaneta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-12-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: addressable
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: charlock_holmes
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: shared-mime-info
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mime-types
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: http-cookie
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rails
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rspec-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: simplecov
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: pry-rails
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ - !ruby/object:Gem::Dependency
154
+ name: pry-byebug
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :development
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
167
+ description: A simple web crawler for ruby
168
+ email:
169
+ - kaneta@sitebridge.co.jp
170
+ executables: []
171
+ extensions: []
172
+ extra_rdoc_files: []
173
+ files:
174
+ - README.md
175
+ - Rakefile
176
+ - lib/kudzu.rb
177
+ - lib/kudzu/adapter/base/all.rb
178
+ - lib/kudzu/adapter/base/link.rb
179
+ - lib/kudzu/adapter/base/page.rb
180
+ - lib/kudzu/adapter/memory.rb
181
+ - lib/kudzu/adapter/memory/all.rb
182
+ - lib/kudzu/adapter/memory/frontier.rb
183
+ - lib/kudzu/adapter/memory/model/link.rb
184
+ - lib/kudzu/adapter/memory/model/page.rb
185
+ - lib/kudzu/adapter/memory/repository.rb
186
+ - lib/kudzu/agent/all.rb
187
+ - lib/kudzu/agent/charset_detector.rb
188
+ - lib/kudzu/agent/fetcher.rb
189
+ - lib/kudzu/agent/filter.rb
190
+ - lib/kudzu/agent/mime_type_detector.rb
191
+ - lib/kudzu/agent/robots.rb
192
+ - lib/kudzu/agent/sleeper.rb
193
+ - lib/kudzu/agent/title_parser.rb
194
+ - lib/kudzu/agent/url_extractor.rb
195
+ - lib/kudzu/agent/url_filter.rb
196
+ - lib/kudzu/callback.rb
197
+ - lib/kudzu/common.rb
198
+ - lib/kudzu/config.rb
199
+ - lib/kudzu/config/filter.rb
200
+ - lib/kudzu/crawler.rb
201
+ - lib/kudzu/logger.rb
202
+ - lib/kudzu/revisit/all.rb
203
+ - lib/kudzu/revisit/scheduler.rb
204
+ - lib/kudzu/util/all.rb
205
+ - lib/kudzu/util/connection_pool.rb
206
+ - lib/kudzu/util/content_type_parser.rb
207
+ - lib/kudzu/util/matcher.rb
208
+ - lib/kudzu/util/thread_pool.rb
209
+ - lib/kudzu/version.rb
210
+ homepage: https://github.com/kanety/kudzu
211
+ licenses:
212
+ - MIT
213
+ metadata: {}
214
+ post_install_message:
215
+ rdoc_options: []
216
+ require_paths:
217
+ - lib
218
+ required_ruby_version: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
223
+ required_rubygems_version: !ruby/object:Gem::Requirement
224
+ requirements:
225
+ - - ">="
226
+ - !ruby/object:Gem::Version
227
+ version: '0'
228
+ requirements: []
229
+ rubyforge_project:
230
+ rubygems_version: 2.5.2.2
231
+ signing_key:
232
+ specification_version: 4
233
+ summary: A simple web crawler for ruby
234
+ test_files: []