kudzu-adapter-active_record 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b41da782783d660d5366be6d2c235d50722f8574
4
- data.tar.gz: f41f69bf3555773358301b07e9b6832c36424a79
2
+ SHA256:
3
+ metadata.gz: 6070cb29d79bb57e5dd856f975ad58ae27d04ec1d5155a8ff93c65530ea3032c
4
+ data.tar.gz: a4f62c4f71e36dae0d5d9061443a606e54a3954220d0f94b059b9e6d38dd8978
5
5
  SHA512:
6
- metadata.gz: 1e3f5cc57bb822445c288c3a999614a18811ae3e835205b44295c8ac38b804327254436bb85236545822de1e097e20af35971dccb8cfd7bc09ffcf80f3ff86a5
7
- data.tar.gz: d8626598ce2b61a69eb75f3d897c0b881847a2745bd36db63cb2443b6e5f24300c6e08562c219556c824c932f548ca65fece1303cf4be581771c71aa528eda26
6
+ metadata.gz: 9ebf5a08ea9fc737fd342b4eadf443c01caafb0e14a43f78a60aac3cd0dc8a269d63635aee857fa59674ab20b7a41a464a486876e8b5a19c730e328af68a1c31
7
+ data.tar.gz: 2257f8e31654c5512a1b217b5edf9a657440ec73b2b909ac67b717b9cc7c3ebbf81166e7921c1536c3ca2f00c77601a3d8951a18a9789b36fc4213b5cc6592ac
data/README.md CHANGED
@@ -4,7 +4,7 @@ ActiveRecord adapter for kudzu crawler.
4
4
 
5
5
  ## Dependencies
6
6
 
7
- * kudzu 1.0+
7
+ * kudzu 1.1+
8
8
  * activerecord 5.0+
9
9
 
10
10
  ## Installation
@@ -30,7 +30,7 @@ Migrate into your application:
30
30
  This migration creates following tables:
31
31
 
32
32
  * kudzu_pages
33
- * kudzu_contents
33
+ * kudzu_chunks
34
34
  * kudzu_links
35
35
 
36
36
  ## Usage
@@ -9,7 +9,7 @@ module Kudzu
9
9
  def create
10
10
  @migration_version = migration_version
11
11
  timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S").to_i
12
- ["create_kudzu_pages", "create_kudzu_contents", "create_kudzu_links"].each_with_index do |filename, i|
12
+ ["create_kudzu_pages", "create_kudzu_chunks", "create_kudzu_links"].each_with_index do |filename, i|
13
13
  timestamp += i
14
14
  template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
15
15
  end
@@ -0,0 +1,9 @@
1
+ class CreateKudzuChunks < ActiveRecord::Migration<%= @migration_version %>
2
+ def change
3
+ create_table :kudzu_chunks do |t|
4
+ t.references :page
5
+ t.binary :data
6
+ t.timestamps null: false
7
+ end
8
+ end
9
+ end
@@ -13,8 +13,6 @@ class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
13
13
  t.text :redirect_from
14
14
  t.datetime :fetched_at
15
15
  t.datetime :revised_at
16
- t.integer :revisit_interval
17
- t.datetime :revisit_at
18
16
  t.timestamps null: false
19
17
 
20
18
  t.index :url, length: 32
@@ -1,10 +1,6 @@
1
1
  require_relative 'model/base'
2
2
  require_relative 'model/page'
3
- require_relative 'model/content'
3
+ require_relative 'model/chunk'
4
4
  require_relative 'model/link'
5
5
  require_relative 'frontier'
6
6
  require_relative 'repository'
7
-
8
- Kudzu::Page = Kudzu::Adapter::ActiveRecord::Page
9
- Kudzu::Link = Kudzu::Adapter::ActiveRecord::Link
10
- Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
@@ -10,7 +10,7 @@ module Kudzu
10
10
  def enqueue(links, depth: 0)
11
11
  @monitor.synchronize do
12
12
  links = filter_existing_urls(links)
13
- Link.import(links)
13
+ Link.bulk_import(links)
14
14
  links
15
15
  end
16
16
  end
@@ -0,0 +1,13 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module ActiveRecord
4
+ class Chunk < Base
5
+ belongs_to :page
6
+
7
+ scope :select_without_data, -> { select(column_names - %w(data)) }
8
+ end
9
+ end
10
+ end
11
+
12
+ Chunk = Adapter::ActiveRecord::Chunk
13
+ end
@@ -2,8 +2,10 @@ module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
4
  class Link < Base
5
- include Kudzu::Adapter::Base::Link
5
+ include Kudzu::Model::Link
6
6
  end
7
7
  end
8
8
  end
9
+
10
+ Link = Adapter::ActiveRecord::Link
9
11
  end
@@ -2,9 +2,9 @@ module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
4
  class Page < Base
5
- include Kudzu::Adapter::Base::Page
5
+ include Kudzu::Model::Page
6
6
 
7
- has_one :content, dependent: :destroy
7
+ has_many :chunks, -> { order(id: :asc) }, dependent: :delete_all
8
8
 
9
9
  def response_header
10
10
  if response_header_column_is_text?
@@ -22,6 +22,10 @@ module Kudzu
22
22
  end
23
23
  end
24
24
 
25
+ def data
26
+ chunks.pluck(:data).join
27
+ end
28
+
25
29
  private
26
30
 
27
31
  def response_header_column_is_text?
@@ -31,4 +35,6 @@ module Kudzu
31
35
  end
32
36
  end
33
37
  end
38
+
39
+ Page = Adapter::ActiveRecord::Page
34
40
  end
@@ -0,0 +1,18 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module ActiveRecord
4
+ class Railtie < Rails::Railtie
5
+ ActiveSupport.on_load :active_record do
6
+ require_relative 'all'
7
+ end
8
+
9
+ config.after_initialize do
10
+ Dir.glob(Rails.root + 'app/decorators/kudzu/**/*_decorator*.rb').each do |c|
11
+ require_dependency(c)
12
+ end
13
+ Kudzu.logger = Rails.logger
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -3,20 +3,45 @@ module Kudzu
3
3
  module ActiveRecord
4
4
  class Repository
5
5
  def find_by_url(url)
6
- Page.where(url: url).first_or_initialize
6
+ Page.find_or_initialize_by(url: url)
7
7
  end
8
8
 
9
9
  def register(page)
10
- if page.body
11
- content = page.content || page.build_content
12
- content.data = page.body
10
+ ActiveRecord::Base.transaction do
11
+ page.save
12
+ save_chunks(page) if page.body
13
13
  end
14
- page.save
15
14
  end
16
15
 
17
16
  def delete(page)
18
17
  page.destroy if page
19
18
  end
19
+
20
+ private
21
+
22
+ def save_chunks(page)
23
+ chunk_num = 0
24
+ each_chunk(page.body, ActiveRecord.chunk_size) do |chunked, i|
25
+ chunk = page.chunks.select_without_data.offset(i).limit(1).first_or_initialize
26
+ chunk.data = chunked
27
+ chunk.save
28
+ chunk_num = i + 1
29
+ end
30
+
31
+ if page.chunks.count - chunk_num > 0
32
+ page.chunks.select_without_data.offset(chunk_num).each(&:delete)
33
+ end
34
+ end
35
+
36
+ def each_chunk(body, size)
37
+ pos = 0
38
+ i = 0
39
+ while (chunked = body.byteslice(pos, size))
40
+ yield chunked, i
41
+ pos += size
42
+ i += 1
43
+ end
44
+ end
20
45
  end
21
46
  end
22
47
  end
@@ -1,7 +1,7 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
- VERSION = '1.0.0'
4
+ VERSION = '1.2.0'
5
5
  end
6
6
  end
7
7
  end
@@ -1,19 +1,28 @@
1
1
  require 'activerecord-import'
2
2
  require 'kudzu'
3
3
 
4
+ if defined? Rails
5
+ require_relative 'active_record/railtie'
6
+ else
7
+ require_relative 'active_record/all'
8
+ end
9
+
4
10
  module Kudzu
5
11
  module Adapter
6
12
  module ActiveRecord
13
+ class << self
14
+ @@chunk_size = 5*(1024**2)
15
+
16
+ def chunk_size
17
+ @@chunk_size
18
+ end
19
+
20
+ def chunk_size=(val)
21
+ @@chunk_size = val
22
+ end
23
+ end
7
24
  end
8
25
  end
9
26
  end
10
27
 
11
28
  Kudzu.adapter = Kudzu::Adapter::ActiveRecord
12
-
13
- if defined? Railtie
14
- ActiveSupport.on_load :active_record do
15
- require_relative 'active_record/all'
16
- end
17
- else
18
- require_relative 'active_record/all'
19
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu-adapter-active_record
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-20 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: kudzu
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '1.1'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '1.1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: activerecord
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: sqlite3
70
+ name: webrick
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rspec-rails
84
+ name: sqlite3
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
@@ -95,7 +95,7 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: simplecov
98
+ name: mysql2
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - ">="
@@ -109,7 +109,7 @@ dependencies:
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  - !ruby/object:Gem::Dependency
112
- name: pry-rails
112
+ name: pg
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - ">="
@@ -123,7 +123,7 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: pry-byebug
126
+ name: rspec-rails
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - ">="
@@ -137,7 +137,7 @@ dependencies:
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
- name: database_cleaner
140
+ name: simplecov
141
141
  requirement: !ruby/object:Gem::Requirement
142
142
  requirements:
143
143
  - - ">="
@@ -160,23 +160,24 @@ files:
160
160
  - README.md
161
161
  - Rakefile
162
162
  - lib/generators/kudzu/adapter/active_record/migration_generator.rb
163
- - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
163
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb
164
164
  - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
165
165
  - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
166
166
  - lib/kudzu/adapter/active_record.rb
167
167
  - lib/kudzu/adapter/active_record/all.rb
168
168
  - lib/kudzu/adapter/active_record/frontier.rb
169
169
  - lib/kudzu/adapter/active_record/model/base.rb
170
- - lib/kudzu/adapter/active_record/model/content.rb
170
+ - lib/kudzu/adapter/active_record/model/chunk.rb
171
171
  - lib/kudzu/adapter/active_record/model/link.rb
172
172
  - lib/kudzu/adapter/active_record/model/page.rb
173
+ - lib/kudzu/adapter/active_record/railtie.rb
173
174
  - lib/kudzu/adapter/active_record/repository.rb
174
175
  - lib/kudzu/adapter/active_record/version.rb
175
176
  homepage: https://github.com/kanety/kudzu-adapter-active_record
176
177
  licenses:
177
178
  - MIT
178
179
  metadata: {}
179
- post_install_message:
180
+ post_install_message:
180
181
  rdoc_options: []
181
182
  require_paths:
182
183
  - lib
@@ -191,9 +192,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
191
192
  - !ruby/object:Gem::Version
192
193
  version: '0'
193
194
  requirements: []
194
- rubyforge_project:
195
- rubygems_version: 2.5.2.2
196
- signing_key:
195
+ rubygems_version: 3.3.3
196
+ signing_key:
197
197
  specification_version: 4
198
198
  summary: ActiveRecord adapter for kudzu crawler
199
199
  test_files: []
@@ -1,9 +0,0 @@
1
- class CreateKudzuContents < ActiveRecord::Migration<%= @migration_version %>
2
- def change
3
- create_table :kudzu_contents do |t|
4
- t.references :page
5
- t.binary :data
6
- t.timestamps null: false
7
- end
8
- end
9
- end
@@ -1,9 +0,0 @@
1
- module Kudzu
2
- module Adapter
3
- module ActiveRecord
4
- class Content < Base
5
- belongs_to :page
6
- end
7
- end
8
- end
9
- end