kudzu-adapter-active_record 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +2 -2
- data/lib/generators/kudzu/adapter/active_record/migration_generator.rb +1 -1
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb +9 -0
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb +0 -2
- data/lib/kudzu/adapter/active_record.rb +17 -8
- data/lib/kudzu/adapter/active_record/all.rb +1 -5
- data/lib/kudzu/adapter/active_record/model/chunk.rb +13 -0
- data/lib/kudzu/adapter/active_record/model/link.rb +3 -1
- data/lib/kudzu/adapter/active_record/model/page.rb +8 -2
- data/lib/kudzu/adapter/active_record/railtie.rb +18 -0
- data/lib/kudzu/adapter/active_record/repository.rb +30 -5
- data/lib/kudzu/adapter/active_record/version.rb +1 -1
- metadata +35 -6
- data/lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb +0 -9
- data/lib/kudzu/adapter/active_record/model/content.rb +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 72bd4ed5f1239eb982c765dae9fef82981e0dd9d
|
4
|
+
data.tar.gz: 4e81c7004721352b63e4f297f85c2947b7edd76a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6b2049943de8e5fd38c3d96681e33d1686c59b9908ba50748092c6935e88d8e10d47be18360cc7d5ee7fb9365cbe58dd7af714151eb106a2f56959b19be7342b
|
7
|
+
data.tar.gz: 94defaf96e652e733249da1372cd85a683d08214efd4307b88ec98302ddddbd1c93668e0b4c4e9e1ed65b1f1d42a96937b7cd656dad19e220a10f868d670b173
|
data/README.md
CHANGED
@@ -4,7 +4,7 @@ ActiveRecord adapter for kudzu crawler.
|
|
4
4
|
|
5
5
|
## Dependencies
|
6
6
|
|
7
|
-
* kudzu 1.0+
|
7
|
+
* kudzu 1.1+
|
8
8
|
* activerecord 5.0+
|
9
9
|
|
10
10
|
## Installation
|
@@ -30,7 +30,7 @@ Migrate into your application:
|
|
30
30
|
This migration creates following tables:
|
31
31
|
|
32
32
|
* kudzu_pages
|
33
|
-
* kudzu_contents
|
33
|
+
* kudzu_chunks
|
34
34
|
* kudzu_links
|
35
35
|
|
36
36
|
## Usage
|
@@ -9,7 +9,7 @@ module Kudzu
|
|
9
9
|
def create
|
10
10
|
@migration_version = migration_version
|
11
11
|
timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S").to_i
|
12
|
-
["create_kudzu_pages", "create_kudzu_contents", "create_kudzu_links"].each_with_index do |filename, i|
|
12
|
+
["create_kudzu_pages", "create_kudzu_chunks", "create_kudzu_links"].each_with_index do |filename, i|
|
13
13
|
timestamp += i
|
14
14
|
template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
|
15
15
|
end
|
@@ -13,8 +13,6 @@ class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
|
|
13
13
|
t.text :redirect_from
|
14
14
|
t.datetime :fetched_at
|
15
15
|
t.datetime :revised_at
|
16
|
-
t.integer :revisit_interval
|
17
|
-
t.datetime :revisit_at
|
18
16
|
t.timestamps null: false
|
19
17
|
|
20
18
|
t.index :url, length: 32
|
@@ -1,19 +1,28 @@
|
|
1
1
|
require 'activerecord-import'
|
2
2
|
require 'kudzu'
|
3
3
|
|
4
|
+
if defined? Rails
|
5
|
+
require_relative 'active_record/railtie'
|
6
|
+
else
|
7
|
+
require_relative 'active_record/all'
|
8
|
+
end
|
9
|
+
|
4
10
|
module Kudzu
|
5
11
|
module Adapter
|
6
12
|
module ActiveRecord
|
13
|
+
class << self
|
14
|
+
@@chunk_size = 5*(1024**2)
|
15
|
+
|
16
|
+
def chunk_size
|
17
|
+
@@chunk_size
|
18
|
+
end
|
19
|
+
|
20
|
+
def chunk_size=(val)
|
21
|
+
@@chunk_size = val
|
22
|
+
end
|
23
|
+
end
|
7
24
|
end
|
8
25
|
end
|
9
26
|
end
|
10
27
|
|
11
28
|
Kudzu.adapter = Kudzu::Adapter::ActiveRecord
|
12
|
-
|
13
|
-
if defined? Railtie
|
14
|
-
ActiveSupport.on_load :active_record do
|
15
|
-
require_relative 'active_record/all'
|
16
|
-
end
|
17
|
-
else
|
18
|
-
require_relative 'active_record/all'
|
19
|
-
end
|
@@ -1,10 +1,6 @@
|
|
1
1
|
require_relative 'model/base'
|
2
2
|
require_relative 'model/page'
|
3
|
-
require_relative 'model/content'
|
3
|
+
require_relative 'model/chunk'
|
4
4
|
require_relative 'model/link'
|
5
5
|
require_relative 'frontier'
|
6
6
|
require_relative 'repository'
|
7
|
-
|
8
|
-
Kudzu::Page = Kudzu::Adapter::ActiveRecord::Page
|
9
|
-
Kudzu::Link = Kudzu::Adapter::ActiveRecord::Link
|
10
|
-
Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
|
@@ -2,9 +2,9 @@ module Kudzu
|
|
2
2
|
module Adapter
|
3
3
|
module ActiveRecord
|
4
4
|
class Page < Base
|
5
|
-
include Kudzu::
|
5
|
+
include Kudzu::Model::Page
|
6
6
|
|
7
|
-
|
7
|
+
has_many :chunks, -> { order(id: :asc) }, dependent: :delete_all
|
8
8
|
|
9
9
|
def response_header
|
10
10
|
if response_header_column_is_text?
|
@@ -22,6 +22,10 @@ module Kudzu
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
+
def data
|
26
|
+
chunks.pluck(:data).join
|
27
|
+
end
|
28
|
+
|
25
29
|
private
|
26
30
|
|
27
31
|
def response_header_column_is_text?
|
@@ -31,4 +35,6 @@ module Kudzu
|
|
31
35
|
end
|
32
36
|
end
|
33
37
|
end
|
38
|
+
|
39
|
+
Page = Adapter::ActiveRecord::Page
|
34
40
|
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Kudzu
|
2
|
+
module Adapter
|
3
|
+
module ActiveRecord
|
4
|
+
class Railtie < Rails::Railtie
|
5
|
+
ActiveSupport.on_load :active_record do
|
6
|
+
require_relative 'all'
|
7
|
+
end
|
8
|
+
|
9
|
+
config.after_initialize do
|
10
|
+
Dir.glob(Rails.root + 'app/decorators/kudzu/**/*_decorator*.rb').each do |c|
|
11
|
+
require_dependency(c)
|
12
|
+
end
|
13
|
+
Kudzu.logger = Rails.logger
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -3,20 +3,45 @@ module Kudzu
|
|
3
3
|
module ActiveRecord
|
4
4
|
class Repository
|
5
5
|
def find_by_url(url)
|
6
|
-
Page.
|
6
|
+
Page.find_or_initialize_by(url: url)
|
7
7
|
end
|
8
8
|
|
9
9
|
def register(page)
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
ActiveRecord::Base.transaction do
|
11
|
+
page.save
|
12
|
+
save_chunks(page) if page.body
|
13
13
|
end
|
14
|
-
page.save
|
15
14
|
end
|
16
15
|
|
17
16
|
def delete(page)
|
18
17
|
page.destroy if page
|
19
18
|
end
|
19
|
+
|
20
|
+
private
|
21
|
+
|
22
|
+
def save_chunks(page)
|
23
|
+
chunk_num = 0
|
24
|
+
each_chunk(page.body, ActiveRecord.chunk_size) do |chunked, i|
|
25
|
+
chunk = page.chunks.select_without_data.offset(i).limit(1).first_or_initialize
|
26
|
+
chunk.data = chunked
|
27
|
+
chunk.save
|
28
|
+
chunk_num = i + 1
|
29
|
+
end
|
30
|
+
|
31
|
+
if page.chunks.count - chunk_num > 0
|
32
|
+
page.chunks.select_without_data.offset(chunk_num).each(&:delete)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def each_chunk(body, size)
|
37
|
+
pos = 0
|
38
|
+
i = 0
|
39
|
+
while (chunked = body.byteslice(pos, size))
|
40
|
+
yield chunked, i
|
41
|
+
pos += size
|
42
|
+
i += 1
|
43
|
+
end
|
44
|
+
end
|
20
45
|
end
|
21
46
|
end
|
22
47
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kudzu-adapter-active_record
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoshikazu Kaneta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: kudzu
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.0'
|
19
|
+
version: '1.1'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.0'
|
26
|
+
version: '1.1'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: activerecord
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,6 +80,34 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: mysql2
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: pg
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
83
111
|
- !ruby/object:Gem::Dependency
|
84
112
|
name: rspec-rails
|
85
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -160,16 +188,17 @@ files:
|
|
160
188
|
- README.md
|
161
189
|
- Rakefile
|
162
190
|
- lib/generators/kudzu/adapter/active_record/migration_generator.rb
|
163
|
-
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
|
191
|
+
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb
|
164
192
|
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
|
165
193
|
- lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
|
166
194
|
- lib/kudzu/adapter/active_record.rb
|
167
195
|
- lib/kudzu/adapter/active_record/all.rb
|
168
196
|
- lib/kudzu/adapter/active_record/frontier.rb
|
169
197
|
- lib/kudzu/adapter/active_record/model/base.rb
|
170
|
-
- lib/kudzu/adapter/active_record/model/content.rb
|
198
|
+
- lib/kudzu/adapter/active_record/model/chunk.rb
|
171
199
|
- lib/kudzu/adapter/active_record/model/link.rb
|
172
200
|
- lib/kudzu/adapter/active_record/model/page.rb
|
201
|
+
- lib/kudzu/adapter/active_record/railtie.rb
|
173
202
|
- lib/kudzu/adapter/active_record/repository.rb
|
174
203
|
- lib/kudzu/adapter/active_record/version.rb
|
175
204
|
homepage: https://github.com/kanety/kudzu-adapter-active_record
|