kudzu-adapter-active_record 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.md +2 -2
- data/lib/generators/kudzu/adapter/active_record/migration_generator.rb +1 -1
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb +9 -0
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb +0 -2
- data/lib/kudzu/adapter/active_record/all.rb +1 -5
- data/lib/kudzu/adapter/active_record/frontier.rb +1 -1
- data/lib/kudzu/adapter/active_record/model/chunk.rb +13 -0
- data/lib/kudzu/adapter/active_record/model/link.rb +3 -1
- data/lib/kudzu/adapter/active_record/model/page.rb +8 -2
- data/lib/kudzu/adapter/active_record/railtie.rb +18 -0
- data/lib/kudzu/adapter/active_record/repository.rb +30 -5
- data/lib/kudzu/adapter/active_record/version.rb +1 -1
- data/lib/kudzu/adapter/active_record.rb +17 -8
- metadata +17 -17
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb +0 -9
- data/lib/kudzu/adapter/active_record/model/content.rb +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 6070cb29d79bb57e5dd856f975ad58ae27d04ec1d5155a8ff93c65530ea3032c
|
4
|
+
data.tar.gz: a4f62c4f71e36dae0d5d9061443a606e54a3954220d0f94b059b9e6d38dd8978
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9ebf5a08ea9fc737fd342b4eadf443c01caafb0e14a43f78a60aac3cd0dc8a269d63635aee857fa59674ab20b7a41a464a486876e8b5a19c730e328af68a1c31
|
7
|
+
data.tar.gz: 2257f8e31654c5512a1b217b5edf9a657440ec73b2b909ac67b717b9cc7c3ebbf81166e7921c1536c3ca2f00c77601a3d8951a18a9789b36fc4213b5cc6592ac
|
data/README.md
CHANGED
@@ -4,7 +4,7 @@ ActiveRecord adapter for kudzu crawler.
|
|
4
4
|
|
5
5
|
## Dependencies
|
6
6
|
|
7
|
-
* kudzu 1.
|
7
|
+
* kudzu 1.1+
|
8
8
|
* activerecord 5.0+
|
9
9
|
|
10
10
|
## Installation
|
@@ -30,7 +30,7 @@ Migrate into your application:
|
|
30
30
|
This migration creates following tables:
|
31
31
|
|
32
32
|
* kudzu_pages
|
33
|
-
*
|
33
|
+
* kudzu_chunks
|
34
34
|
* kudzu_links
|
35
35
|
|
36
36
|
## Usage
|
@@ -9,7 +9,7 @@ module Kudzu
|
|
9
9
|
def create
|
10
10
|
@migration_version = migration_version
|
11
11
|
timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S").to_i
|
12
|
-
["create_kudzu_pages", "
|
12
|
+
["create_kudzu_pages", "create_kudzu_chunks", "create_kudzu_links"].each_with_index do |filename, i|
|
13
13
|
timestamp += i
|
14
14
|
template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
|
15
15
|
end
|
@@ -13,8 +13,6 @@ class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
|
|
13
13
|
t.text :redirect_from
|
14
14
|
t.datetime :fetched_at
|
15
15
|
t.datetime :revised_at
|
16
|
-
t.integer :revisit_interval
|
17
|
-
t.datetime :revisit_at
|
18
16
|
t.timestamps null: false
|
19
17
|
|
20
18
|
t.index :url, length: 32
|
@@ -1,10 +1,6 @@
|
|
1
1
|
require_relative 'model/base'
|
2
2
|
require_relative 'model/page'
|
3
|
-
require_relative 'model/
|
3
|
+
require_relative 'model/chunk'
|
4
4
|
require_relative 'model/link'
|
5
5
|
require_relative 'frontier'
|
6
6
|
require_relative 'repository'
|
7
|
-
|
8
|
-
Kudzu::Page = Kudzu::Adapter::ActiveRecord::Page
|
9
|
-
Kudzu::Link = Kudzu::Adapter::ActiveRecord::Link
|
10
|
-
Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
|
@@ -2,9 +2,9 @@ module Kudzu
|
|
2
2
|
module Adapter
|
3
3
|
module ActiveRecord
|
4
4
|
class Page < Base
|
5
|
-
include Kudzu::
|
5
|
+
include Kudzu::Model::Page
|
6
6
|
|
7
|
-
|
7
|
+
has_many :chunks, -> { order(id: :asc) }, dependent: :delete_all
|
8
8
|
|
9
9
|
def response_header
|
10
10
|
if response_header_column_is_text?
|
@@ -22,6 +22,10 @@ module Kudzu
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
+
def data
|
26
|
+
chunks.pluck(:data).join
|
27
|
+
end
|
28
|
+
|
25
29
|
private
|
26
30
|
|
27
31
|
def response_header_column_is_text?
|
@@ -31,4 +35,6 @@ module Kudzu
|
|
31
35
|
end
|
32
36
|
end
|
33
37
|
end
|
38
|
+
|
39
|
+
Page = Adapter::ActiveRecord::Page
|
34
40
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Kudzu
|
2
|
+
module Adapter
|
3
|
+
module ActiveRecord
|
4
|
+
class Railtie < Rails::Railtie
|
5
|
+
ActiveSupport.on_load :active_record do
|
6
|
+
require_relative 'all'
|
7
|
+
end
|
8
|
+
|
9
|
+
config.after_initialize do
|
10
|
+
Dir.glob(Rails.root + 'app/decorators/kudzu/**/*_decorator*.rb').each do |c|
|
11
|
+
require_dependency(c)
|
12
|
+
end
|
13
|
+
Kudzu.logger = Rails.logger
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -3,20 +3,45 @@ module Kudzu
|
|
3
3
|
module ActiveRecord
|
4
4
|
class Repository
|
5
5
|
def find_by_url(url)
|
6
|
-
Page.
|
6
|
+
Page.find_or_initialize_by(url: url)
|
7
7
|
end
|
8
8
|
|
9
9
|
def register(page)
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
ActiveRecord::Base.transaction do
|
11
|
+
page.save
|
12
|
+
save_chunks(page) if page.body
|
13
13
|
end
|
14
|
-
page.save
|
15
14
|
end
|
16
15
|
|
17
16
|
def delete(page)
|
18
17
|
page.destroy if page
|
19
18
|
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def save_chunks(page)
|
23
|
+
chunk_num = 0
|
24
|
+
each_chunk(page.body, ActiveRecord.chunk_size) do |chunked, i|
|
25
|
+
chunk = page.chunks.select_without_data.offset(i).limit(1).first_or_initialize
|
26
|
+
chunk.data = chunked
|
27
|
+
chunk.save
|
28
|
+
chunk_num = i + 1
|
29
|
+
end
|
30
|
+
|
31
|
+
if page.chunks.count - chunk_num > 0
|
32
|
+
page.chunks.select_without_data.offset(chunk_num).each(&:delete)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def each_chunk(body, size)
|
37
|
+
pos = 0
|
38
|
+
i = 0
|
39
|
+
while (chunked = body.byteslice(pos, size))
|
40
|
+
yield chunked, i
|
41
|
+
pos += size
|
42
|
+
i += 1
|
43
|
+
end
|
44
|
+
end
|
20
45
|
end
|
21
46
|
end
|
22
47
|
end
|
@@ -1,19 +1,28 @@
|
|
1
1
|
require 'activerecord-import'
|
2
2
|
require 'kudzu'
|
3
3
|
|
4
|
+
if defined? Rails
|
5
|
+
require_relative 'active_record/railtie'
|
6
|
+
else
|
7
|
+
require_relative 'active_record/all'
|
8
|
+
end
|
9
|
+
|
4
10
|
module Kudzu
|
5
11
|
module Adapter
|
6
12
|
module ActiveRecord
|
13
|
+
class << self
|
14
|
+
@@chunk_size = 5*(1024**2)
|
15
|
+
|
16
|
+
def chunk_size
|
17
|
+
@@chunk_size
|
18
|
+
end
|
19
|
+
|
20
|
+
def chunk_size=(val)
|
21
|
+
@@chunk_size = val
|
22
|
+
end
|
23
|
+
end
|
7
24
|
end
|
8
25
|
end
|
9
26
|
end
|
10
27
|
|
11
28
|
Kudzu.adapter = Kudzu::Adapter::ActiveRecord
|
12
|
-
|
13
|
-
if defined? Railtie
|
14
|
-
ActiveSupport.on_load :active_record do
|
15
|
-
require_relative 'active_record/all'
|
16
|
-
end
|
17
|
-
else
|
18
|
-
require_relative 'active_record/all'
|
19
|
-
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu-adapter-active_record
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: kudzu
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
19
|
+
version: '1.1'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.
|
26
|
+
version: '1.1'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: activerecord
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: webrick
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -81,7 +81,7 @@ dependencies:
|
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
83
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
84
|
+
name: sqlite3
|
85
85
|
requirement: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - ">="
|
@@ -95,7 +95,7 @@ dependencies:
|
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
97
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
98
|
+
name: mysql2
|
99
99
|
requirement: !ruby/object:Gem::Requirement
|
100
100
|
requirements:
|
101
101
|
- - ">="
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: '0'
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
112
|
+
name: pg
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
114
114
|
requirements:
|
115
115
|
- - ">="
|
@@ -123,7 +123,7 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: rspec-rails
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
129
|
- - ">="
|
@@ -137,7 +137,7 @@ dependencies:
|
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
140
|
+
name: simplecov
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
142
142
|
requirements:
|
143
143
|
- - ">="
|
@@ -160,23 +160,24 @@ files:
|
|
160
160
|
- README.md
|
161
161
|
- Rakefile
|
162
162
|
- lib/generators/kudzu/adapter/active_record/migration_generator.rb
|
163
|
-
- lib/generators/kudzu/adapter/active_record/templates/
|
163
|
+
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb
|
164
164
|
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
|
165
165
|
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
|
166
166
|
- lib/kudzu/adapter/active_record.rb
|
167
167
|
- lib/kudzu/adapter/active_record/all.rb
|
168
168
|
- lib/kudzu/adapter/active_record/frontier.rb
|
169
169
|
- lib/kudzu/adapter/active_record/model/base.rb
|
170
|
-
- lib/kudzu/adapter/active_record/model/
|
170
|
+
- lib/kudzu/adapter/active_record/model/chunk.rb
|
171
171
|
- lib/kudzu/adapter/active_record/model/link.rb
|
172
172
|
- lib/kudzu/adapter/active_record/model/page.rb
|
173
|
+
- lib/kudzu/adapter/active_record/railtie.rb
|
173
174
|
- lib/kudzu/adapter/active_record/repository.rb
|
174
175
|
- lib/kudzu/adapter/active_record/version.rb
|
175
176
|
homepage: https://github.com/kanety/kudzu-adapter-active_record
|
176
177
|
licenses:
|
177
178
|
- MIT
|
178
179
|
metadata: {}
|
179
|
-
post_install_message:
|
180
|
+
post_install_message:
|
180
181
|
rdoc_options: []
|
181
182
|
require_paths:
|
182
183
|
- lib
|
@@ -191,9 +192,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
192
|
- !ruby/object:Gem::Version
|
192
193
|
version: '0'
|
193
194
|
requirements: []
|
194
|
-
|
195
|
-
|
196
|
-
signing_key:
|
195
|
+
rubygems_version: 3.3.3
|
196
|
+
signing_key:
|
197
197
|
specification_version: 4
|
198
198
|
summary: ActiveRecord adapter for kudzu crawler
|
199
199
|
test_files: []
|