kudzu-adapter-active_record 1.0.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b41da782783d660d5366be6d2c235d50722f8574
4
- data.tar.gz: f41f69bf3555773358301b07e9b6832c36424a79
2
+ SHA256:
3
+ metadata.gz: 6070cb29d79bb57e5dd856f975ad58ae27d04ec1d5155a8ff93c65530ea3032c
4
+ data.tar.gz: a4f62c4f71e36dae0d5d9061443a606e54a3954220d0f94b059b9e6d38dd8978
5
5
  SHA512:
6
- metadata.gz: 1e3f5cc57bb822445c288c3a999614a18811ae3e835205b44295c8ac38b804327254436bb85236545822de1e097e20af35971dccb8cfd7bc09ffcf80f3ff86a5
7
- data.tar.gz: d8626598ce2b61a69eb75f3d897c0b881847a2745bd36db63cb2443b6e5f24300c6e08562c219556c824c932f548ca65fece1303cf4be581771c71aa528eda26
6
+ metadata.gz: 9ebf5a08ea9fc737fd342b4eadf443c01caafb0e14a43f78a60aac3cd0dc8a269d63635aee857fa59674ab20b7a41a464a486876e8b5a19c730e328af68a1c31
7
+ data.tar.gz: 2257f8e31654c5512a1b217b5edf9a657440ec73b2b909ac67b717b9cc7c3ebbf81166e7921c1536c3ca2f00c77601a3d8951a18a9789b36fc4213b5cc6592ac
data/README.md CHANGED
@@ -4,7 +4,7 @@ ActiveRecord adapter for kudzu crawler.
4
4
 
5
5
  ## Dependencies
6
6
 
7
- * kudzu 1.0+
7
+ * kudzu 1.1+
8
8
  * activerecord 5.0+
9
9
 
10
10
  ## Installation
@@ -30,7 +30,7 @@ Migrate into your application:
30
30
  This migration creates following tables:
31
31
 
32
32
  * kudzu_pages
33
- * kudzu_contents
33
+ * kudzu_chunks
34
34
  * kudzu_links
35
35
 
36
36
  ## Usage
@@ -9,7 +9,7 @@ module Kudzu
9
9
  def create
10
10
  @migration_version = migration_version
11
11
  timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S").to_i
12
- ["create_kudzu_pages", "create_kudzu_contents", "create_kudzu_links"].each_with_index do |filename, i|
12
+ ["create_kudzu_pages", "create_kudzu_chunks", "create_kudzu_links"].each_with_index do |filename, i|
13
13
  timestamp += i
14
14
  template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
15
15
  end
@@ -0,0 +1,9 @@
1
+ class CreateKudzuChunks < ActiveRecord::Migration<%= @migration_version %>
2
+ def change
3
+ create_table :kudzu_chunks do |t|
4
+ t.references :page
5
+ t.binary :data
6
+ t.timestamps null: false
7
+ end
8
+ end
9
+ end
@@ -13,8 +13,6 @@ class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
13
13
  t.text :redirect_from
14
14
  t.datetime :fetched_at
15
15
  t.datetime :revised_at
16
- t.integer :revisit_interval
17
- t.datetime :revisit_at
18
16
  t.timestamps null: false
19
17
 
20
18
  t.index :url, length: 32
@@ -1,10 +1,6 @@
1
1
  require_relative 'model/base'
2
2
  require_relative 'model/page'
3
- require_relative 'model/content'
3
+ require_relative 'model/chunk'
4
4
  require_relative 'model/link'
5
5
  require_relative 'frontier'
6
6
  require_relative 'repository'
7
-
8
- Kudzu::Page = Kudzu::Adapter::ActiveRecord::Page
9
- Kudzu::Link = Kudzu::Adapter::ActiveRecord::Link
10
- Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
@@ -10,7 +10,7 @@ module Kudzu
10
10
  def enqueue(links, depth: 0)
11
11
  @monitor.synchronize do
12
12
  links = filter_existing_urls(links)
13
- Link.import(links)
13
+ Link.bulk_import(links)
14
14
  links
15
15
  end
16
16
  end
@@ -0,0 +1,13 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module ActiveRecord
4
+ class Chunk < Base
5
+ belongs_to :page
6
+
7
+ scope :select_without_data, -> { select(column_names - %w(data)) }
8
+ end
9
+ end
10
+ end
11
+
12
+ Chunk = Adapter::ActiveRecord::Chunk
13
+ end
@@ -2,8 +2,10 @@ module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
4
  class Link < Base
5
- include Kudzu::Adapter::Base::Link
5
+ include Kudzu::Model::Link
6
6
  end
7
7
  end
8
8
  end
9
+
10
+ Link = Adapter::ActiveRecord::Link
9
11
  end
@@ -2,9 +2,9 @@ module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
4
  class Page < Base
5
- include Kudzu::Adapter::Base::Page
5
+ include Kudzu::Model::Page
6
6
 
7
- has_one :content, dependent: :destroy
7
+ has_many :chunks, -> { order(id: :asc) }, dependent: :delete_all
8
8
 
9
9
  def response_header
10
10
  if response_header_column_is_text?
@@ -22,6 +22,10 @@ module Kudzu
22
22
  end
23
23
  end
24
24
 
25
+ def data
26
+ chunks.pluck(:data).join
27
+ end
28
+
25
29
  private
26
30
 
27
31
  def response_header_column_is_text?
@@ -31,4 +35,6 @@ module Kudzu
31
35
  end
32
36
  end
33
37
  end
38
+
39
+ Page = Adapter::ActiveRecord::Page
34
40
  end
@@ -0,0 +1,18 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module ActiveRecord
4
+ class Railtie < Rails::Railtie
5
+ ActiveSupport.on_load :active_record do
6
+ require_relative 'all'
7
+ end
8
+
9
+ config.after_initialize do
10
+ Dir.glob(Rails.root + 'app/decorators/kudzu/**/*_decorator*.rb').each do |c|
11
+ require_dependency(c)
12
+ end
13
+ Kudzu.logger = Rails.logger
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -3,20 +3,45 @@ module Kudzu
3
3
  module ActiveRecord
4
4
  class Repository
5
5
  def find_by_url(url)
6
- Page.where(url: url).first_or_initialize
6
+ Page.find_or_initialize_by(url: url)
7
7
  end
8
8
 
9
9
  def register(page)
10
- if page.body
11
- content = page.content || page.build_content
12
- content.data = page.body
10
+ ActiveRecord::Base.transaction do
11
+ page.save
12
+ save_chunks(page) if page.body
13
13
  end
14
- page.save
15
14
  end
16
15
 
17
16
  def delete(page)
18
17
  page.destroy if page
19
18
  end
19
+
20
+ private
21
+
22
+ def save_chunks(page)
23
+ chunk_num = 0
24
+ each_chunk(page.body, ActiveRecord.chunk_size) do |chunked, i|
25
+ chunk = page.chunks.select_without_data.offset(i).limit(1).first_or_initialize
26
+ chunk.data = chunked
27
+ chunk.save
28
+ chunk_num = i + 1
29
+ end
30
+
31
+ if page.chunks.count - chunk_num > 0
32
+ page.chunks.select_without_data.offset(chunk_num).each(&:delete)
33
+ end
34
+ end
35
+
36
+ def each_chunk(body, size)
37
+ pos = 0
38
+ i = 0
39
+ while (chunked = body.byteslice(pos, size))
40
+ yield chunked, i
41
+ pos += size
42
+ i += 1
43
+ end
44
+ end
20
45
  end
21
46
  end
22
47
  end
@@ -1,7 +1,7 @@
1
1
  module Kudzu
2
2
  module Adapter
3
3
  module ActiveRecord
4
- VERSION = '1.0.0'
4
+ VERSION = '1.2.0'
5
5
  end
6
6
  end
7
7
  end
@@ -1,19 +1,28 @@
1
1
  require 'activerecord-import'
2
2
  require 'kudzu'
3
3
 
4
+ if defined? Rails
5
+ require_relative 'active_record/railtie'
6
+ else
7
+ require_relative 'active_record/all'
8
+ end
9
+
4
10
  module Kudzu
5
11
  module Adapter
6
12
  module ActiveRecord
13
+ class << self
14
+ @@chunk_size = 5*(1024**2)
15
+
16
+ def chunk_size
17
+ @@chunk_size
18
+ end
19
+
20
+ def chunk_size=(val)
21
+ @@chunk_size = val
22
+ end
23
+ end
7
24
  end
8
25
  end
9
26
  end
10
27
 
11
28
  Kudzu.adapter = Kudzu::Adapter::ActiveRecord
12
-
13
- if defined? Railtie
14
- ActiveSupport.on_load :active_record do
15
- require_relative 'active_record/all'
16
- end
17
- else
18
- require_relative 'active_record/all'
19
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kudzu-adapter-active_record
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yoshikazu Kaneta
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-20 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: kudzu
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '1.0'
19
+ version: '1.1'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '1.0'
26
+ version: '1.1'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: activerecord
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0'
69
69
  - !ruby/object:Gem::Dependency
70
- name: sqlite3
70
+ name: webrick
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -81,7 +81,7 @@ dependencies:
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
83
  - !ruby/object:Gem::Dependency
84
- name: rspec-rails
84
+ name: sqlite3
85
85
  requirement: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - ">="
@@ -95,7 +95,7 @@ dependencies:
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
97
  - !ruby/object:Gem::Dependency
98
- name: simplecov
98
+ name: mysql2
99
99
  requirement: !ruby/object:Gem::Requirement
100
100
  requirements:
101
101
  - - ">="
@@ -109,7 +109,7 @@ dependencies:
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  - !ruby/object:Gem::Dependency
112
- name: pry-rails
112
+ name: pg
113
113
  requirement: !ruby/object:Gem::Requirement
114
114
  requirements:
115
115
  - - ">="
@@ -123,7 +123,7 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: pry-byebug
126
+ name: rspec-rails
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
129
  - - ">="
@@ -137,7 +137,7 @@ dependencies:
137
137
  - !ruby/object:Gem::Version
138
138
  version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
- name: database_cleaner
140
+ name: simplecov
141
141
  requirement: !ruby/object:Gem::Requirement
142
142
  requirements:
143
143
  - - ">="
@@ -160,23 +160,24 @@ files:
160
160
  - README.md
161
161
  - Rakefile
162
162
  - lib/generators/kudzu/adapter/active_record/migration_generator.rb
163
- - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
163
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_chunks.rb.erb
164
164
  - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
165
165
  - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
166
166
  - lib/kudzu/adapter/active_record.rb
167
167
  - lib/kudzu/adapter/active_record/all.rb
168
168
  - lib/kudzu/adapter/active_record/frontier.rb
169
169
  - lib/kudzu/adapter/active_record/model/base.rb
170
- - lib/kudzu/adapter/active_record/model/content.rb
170
+ - lib/kudzu/adapter/active_record/model/chunk.rb
171
171
  - lib/kudzu/adapter/active_record/model/link.rb
172
172
  - lib/kudzu/adapter/active_record/model/page.rb
173
+ - lib/kudzu/adapter/active_record/railtie.rb
173
174
  - lib/kudzu/adapter/active_record/repository.rb
174
175
  - lib/kudzu/adapter/active_record/version.rb
175
176
  homepage: https://github.com/kanety/kudzu-adapter-active_record
176
177
  licenses:
177
178
  - MIT
178
179
  metadata: {}
179
- post_install_message:
180
+ post_install_message:
180
181
  rdoc_options: []
181
182
  require_paths:
182
183
  - lib
@@ -191,9 +192,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
191
192
  - !ruby/object:Gem::Version
192
193
  version: '0'
193
194
  requirements: []
194
- rubyforge_project:
195
- rubygems_version: 2.5.2.2
196
- signing_key:
195
+ rubygems_version: 3.3.3
196
+ signing_key:
197
197
  specification_version: 4
198
198
  summary: ActiveRecord adapter for kudzu crawler
199
199
  test_files: []
@@ -1,9 +0,0 @@
1
- class CreateKudzuContents < ActiveRecord::Migration<%= @migration_version %>
2
- def change
3
- create_table :kudzu_contents do |t|
4
- t.references :page
5
- t.binary :data
6
- t.timestamps null: false
7
- end
8
- end
9
- end
@@ -1,9 +0,0 @@
1
- module Kudzu
2
- module Adapter
3
- module ActiveRecord
4
- class Content < Base
5
- belongs_to :page
6
- end
7
- end
8
- end
9
- end