kudzu-adapter-active_record 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/lib/generators/kudzu/adapter/active_record/migration_generator.rb +28 -0
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb +9 -0
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb +16 -0
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb +24 -0
- data/lib/kudzu/adapter/active_record.rb +19 -0
- data/lib/kudzu/adapter/active_record/all.rb +10 -0
- data/lib/kudzu/adapter/active_record/frontier.rb +43 -0
- data/lib/kudzu/adapter/active_record/model/base.rb +15 -0
- data/lib/kudzu/adapter/active_record/model/content.rb +9 -0
- data/lib/kudzu/adapter/active_record/model/link.rb +9 -0
- data/lib/kudzu/adapter/active_record/model/page.rb +34 -0
- data/lib/kudzu/adapter/active_record/repository.rb +23 -0
- data/lib/kudzu/adapter/active_record/version.rb +7 -0
- metadata +199 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b41da782783d660d5366be6d2c235d50722f8574
|
4
|
+
data.tar.gz: f41f69bf3555773358301b07e9b6832c36424a79
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1e3f5cc57bb822445c288c3a999614a18811ae3e835205b44295c8ac38b804327254436bb85236545822de1e097e20af35971dccb8cfd7bc09ffcf80f3ff86a5
|
7
|
+
data.tar.gz: d8626598ce2b61a69eb75f3d897c0b881847a2745bd36db63cb2443b6e5f24300c6e08562c219556c824c932f548ca65fece1303cf4be581771c71aa528eda26
|
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
# Kudzu::Adapter::ActiveRecord
|
2
|
+
|
3
|
+
ActiveRecord adapter for the kudzu crawler.
|
4
|
+
|
5
|
+
## Dependencies
|
6
|
+
|
7
|
+
* kudzu 1.0+
|
8
|
+
* activerecord 5.0+
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
Add to your application's Gemfile:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
gem 'kudzu-adapter-active_record'
|
16
|
+
```
|
17
|
+
|
18
|
+
Then run:
|
19
|
+
|
20
|
+
$ bundle install
|
21
|
+
|
22
|
+
Create migration files:
|
23
|
+
|
24
|
+
$ rails generate kudzu:adapter:active_record:migration
|
25
|
+
|
26
|
+
Run the migrations in your application:
|
27
|
+
|
28
|
+
$ bundle exec rake db:migrate
|
29
|
+
|
30
|
+
This migration creates the following tables:
|
31
|
+
|
32
|
+
* kudzu_pages
|
33
|
+
* kudzu_contents
|
34
|
+
* kudzu_links
|
35
|
+
|
36
|
+
## Usage
|
37
|
+
|
38
|
+
Basic usage is the same as the [kudzu gem](https://github.com/kanety/kudzu).
|
39
|
+
|
40
|
+
## Contributing
|
41
|
+
|
42
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu-adapter-active_record. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
43
|
+
|
44
|
+
## License
|
45
|
+
|
46
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'rails/generators'
|
2
|
+
|
3
|
+
module Kudzu
  module Adapter
    module ActiveRecord
      # Rails generator that renders the kudzu migration templates into the
      # host application's db/migrate directory.
      class MigrationGenerator < Rails::Generators::Base
        source_root File.join(File.dirname(__FILE__), 'templates')

        # Renders one migration file per template, in the order pages,
        # contents, links.
        #
        # Each file name is prefixed with a UTC timestamp. The prefix of the
        # n-th file is the base timestamp plus n, so the three migrations get
        # consecutive, strictly increasing version numbers.
        def create
          @migration_version = migration_version
          base_version = Time.now.utc.strftime("%Y%m%d%H%M%S").to_i
          ["create_kudzu_pages", "create_kudzu_contents", "create_kudzu_links"].each_with_index do |filename, i|
            # Offset from the fixed base instead of accumulating, which would
            # yield offsets 0, 1, 3 and leave a gap in the sequence.
            template "#{filename}.rb.erb", "db/migrate/#{base_version + i}_#{filename}.rb"
          end
        end

        private

        # Suffix appended to `ActiveRecord::Migration` in the templates.
        #
        # @return [String, nil] e.g. "[5.1]" on ActiveRecord 5 or later, nil
        #   on older versions (the templates then render no suffix)
        def migration_version
          return unless ::ActiveRecord::VERSION::MAJOR >= 5

          "[#{::ActiveRecord::VERSION::MAJOR}.#{::ActiveRecord::VERSION::MINOR}]"
        end
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class CreateKudzuLinks < ActiveRecord::Migration<%= @migration_version %>
|
2
|
+
def change
|
3
|
+
create_table :kudzu_links do |t|
|
4
|
+
t.text :uuid
|
5
|
+
t.text :url
|
6
|
+
t.text :title
|
7
|
+
t.integer :state
|
8
|
+
t.integer :depth
|
9
|
+
t.timestamps null: false
|
10
|
+
|
11
|
+
t.index :uuid, length: 4
|
12
|
+
t.index :url, length: 32
|
13
|
+
t.index :state
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
|
2
|
+
def change
|
3
|
+
create_table :kudzu_pages do |t|
|
4
|
+
t.text :url
|
5
|
+
t.text :title
|
6
|
+
t.integer :status
|
7
|
+
t.text :mime_type
|
8
|
+
t.integer :size
|
9
|
+
t.text :charset
|
10
|
+
t.text :digest
|
11
|
+
t.text :response_header
|
12
|
+
t.float :response_time
|
13
|
+
t.text :redirect_from
|
14
|
+
t.datetime :fetched_at
|
15
|
+
t.datetime :revised_at
|
16
|
+
t.integer :revisit_interval
|
17
|
+
t.datetime :revisit_at
|
18
|
+
t.timestamps null: false
|
19
|
+
|
20
|
+
t.index :url, length: 32
|
21
|
+
t.index :digest, length: 6
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'activerecord-import'
|
2
|
+
require 'kudzu'
|
3
|
+
|
4
|
+
# Namespace for the ActiveRecord-backed kudzu adapter. The classes that live
# in it are loaded from active_record/all further down.
module Kudzu
  module Adapter
    module ActiveRecord; end
  end
end

# Register this namespace as the adapter kudzu should use.
Kudzu.adapter = Kudzu::Adapter::ActiveRecord

# Load the adapter classes either lazily (once ActiveRecord itself has been
# loaded) or immediately.
# NOTE(review): this checks for a top-level `Railtie` constant. If
# `Rails::Railtie` was intended, the lazy branch may never be taken inside a
# Rails application - confirm before changing, since deferring the load
# affects when the adapter classes become available.
if defined?(Railtie)
  ActiveSupport.on_load(:active_record) { require_relative 'active_record/all' }
else
  require_relative 'active_record/all'
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require_relative 'model/base'
|
2
|
+
require_relative 'model/page'
|
3
|
+
require_relative 'model/content'
|
4
|
+
require_relative 'model/link'
|
5
|
+
require_relative 'frontier'
|
6
|
+
require_relative 'repository'
|
7
|
+
|
8
|
+
# Expose the adapter's models under the short Kudzu::* names.
adapter = Kudzu::Adapter::ActiveRecord
Kudzu::Page = adapter::Page
Kudzu::Link = adapter::Link
Kudzu::Content = adapter::Content
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module ActiveRecord
      # Crawl frontier stored through the Link model.
      #
      # All queries are scoped by the uuid given to the constructor. A link
      # with state 0 is waiting in the queue; #dequeue switches it to state 1
      # when it hands the link out.
      class Frontier
        # @param uuid [Object] value matched against the links' uuid column
        # @param config [Hash] accepted for interface compatibility; not used here
        def initialize(uuid, config = {})
          @uuid = uuid
          @monitor = Monitor.new
        end

        # Bulk-inserts the links whose URL is not already stored for this uuid.
        #
        # @param links [Array<Link>, Link] one link or a list of links
        # @param depth [Integer] accepted for interface compatibility; not used here
        # @return [Array<Link>] the links that were actually inserted
        def enqueue(links, depth: 0)
          @monitor.synchronize do
            fresh = filter_existing_urls(links)
            # Nothing new to store: skip the bulk import call entirely.
            Link.import(fresh) unless fresh.empty?
            fresh
          end
        end

        # Takes up to +limit+ queued links (state 0), oldest id first, marks
        # each one as state 1 and saves it.
        #
        # @param limit [Integer] maximum number of links to hand out
        # @return [Array<Link>] the links that were dequeued
        def dequeue(limit: 1)
          @monitor.synchronize do
            queued = Link.where(uuid: @uuid, state: 0).order(id: :asc).limit(limit).to_a
            queued.each do |link|
              link.state = 1
              link.save
            end
          end
        end

        # Deletes every link stored for this uuid.
        def clear
          Link.where(uuid: @uuid).delete_all
        end

        private

        # Normalizes the argument to an array, drops links that repeat a URL
        # already seen earlier in the same batch, and removes links whose URL
        # is already stored for this uuid.
        #
        # @param links [Array<Link>, Link, nil]
        # @return [Array<Link>] links that are safe to insert
        def filter_existing_urls(links)
          candidates = Array(links).uniq(&:url)
          return candidates if candidates.empty?

          stored = Link.where(uuid: @uuid, url: candidates.map(&:url)).pluck(:url)
          # Hash lookup keeps the membership test constant-time per link.
          known = stored.each_with_object({}) { |url, index| index[url] = true }
          candidates.reject { |link| known.key?(link.url) }
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module ActiveRecord
      # ActiveRecord model for a crawled page. Its content record is destroyed
      # together with the page.
      #
      # When the response_header column is a text or string column, the value
      # is stored as a JSON document and decoded on read. For any other column
      # type the value is passed through untouched.
      class Page < Base
        include Kudzu::Adapter::Base::Page

        has_one :content, dependent: :destroy

        # @return [Object, nil] the decoded header value, or nil when nothing
        #   has been stored yet (e.g. on a freshly initialized record)
        def response_header
          raw = super
          # A nil column cannot be parsed as JSON; hand it back unchanged.
          return raw if raw.nil? || !response_header_column_is_text?

          JSON.parse(raw)
        end

        # @param val [Object, nil] header value; nil clears the column instead
        #   of being serialized
        def response_header=(val)
          if response_header_column_is_text? && !val.nil?
            super(JSON.generate(val))
          else
            super
          end
        end

        private

        # @return [Boolean] true when response_header is a :text or :string column
        def response_header_column_is_text?
          type = self.class.columns_hash["response_header"].type
          type == :text || type == :string
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Kudzu
  module Adapter
    module ActiveRecord
      # Stores and looks up crawled pages through the Page model.
      class Repository
        # Returns the first page stored under +url+, or a new unsaved page
        # initialized with that url.
        def find_by_url(url)
          scope = Page.where(url: url)
          scope.first_or_initialize
        end

        # Copies the page body (when present) into the page's content record,
        # building that record if the page has none, then saves the page.
        #
        # @return the result of page.save
        def register(page)
          body = page.body
          (page.content || page.build_content).data = body if body
          page.save
        end

        # Destroys the page; does nothing when no page is given.
        def delete(page)
          return unless page

          page.destroy
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kudzu-adapter-active_record
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Yoshikazu Kaneta
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2017-12-20 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: kudzu
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: activerecord
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: activerecord-import
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rails
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: sqlite3
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rspec-rails
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: simplecov
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: pry-rails
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: pry-byebug
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: database_cleaner
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
description: ActiveRecord adapter for kudzu crawler
|
154
|
+
email:
|
155
|
+
- kaneta@sitebridge.co.jp
|
156
|
+
executables: []
|
157
|
+
extensions: []
|
158
|
+
extra_rdoc_files: []
|
159
|
+
files:
|
160
|
+
- README.md
|
161
|
+
- Rakefile
|
162
|
+
- lib/generators/kudzu/adapter/active_record/migration_generator.rb
|
163
|
+
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
|
164
|
+
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
|
165
|
+
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
|
166
|
+
- lib/kudzu/adapter/active_record.rb
|
167
|
+
- lib/kudzu/adapter/active_record/all.rb
|
168
|
+
- lib/kudzu/adapter/active_record/frontier.rb
|
169
|
+
- lib/kudzu/adapter/active_record/model/base.rb
|
170
|
+
- lib/kudzu/adapter/active_record/model/content.rb
|
171
|
+
- lib/kudzu/adapter/active_record/model/link.rb
|
172
|
+
- lib/kudzu/adapter/active_record/model/page.rb
|
173
|
+
- lib/kudzu/adapter/active_record/repository.rb
|
174
|
+
- lib/kudzu/adapter/active_record/version.rb
|
175
|
+
homepage: https://github.com/kanety/kudzu-adapter-active_record
|
176
|
+
licenses:
|
177
|
+
- MIT
|
178
|
+
metadata: {}
|
179
|
+
post_install_message:
|
180
|
+
rdoc_options: []
|
181
|
+
require_paths:
|
182
|
+
- lib
|
183
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
184
|
+
requirements:
|
185
|
+
- - ">="
|
186
|
+
- !ruby/object:Gem::Version
|
187
|
+
version: '0'
|
188
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
189
|
+
requirements:
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: '0'
|
193
|
+
requirements: []
|
194
|
+
rubyforge_project:
|
195
|
+
rubygems_version: 2.5.2.2
|
196
|
+
signing_key:
|
197
|
+
specification_version: 4
|
198
|
+
summary: ActiveRecord adapter for kudzu crawler
|
199
|
+
test_files: []
|