kudzu-adapter-active_record 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b41da782783d660d5366be6d2c235d50722f8574
4
+ data.tar.gz: f41f69bf3555773358301b07e9b6832c36424a79
5
+ SHA512:
6
+ metadata.gz: 1e3f5cc57bb822445c288c3a999614a18811ae3e835205b44295c8ac38b804327254436bb85236545822de1e097e20af35971dccb8cfd7bc09ffcf80f3ff86a5
7
+ data.tar.gz: d8626598ce2b61a69eb75f3d897c0b881847a2745bd36db63cb2443b6e5f24300c6e08562c219556c824c932f548ca65fece1303cf4be581771c71aa528eda26
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Kudzu::Adapter::ActiveRecord
2
+
3
+ ActiveRecord adapter for kudzu crawler.
4
+
5
+ ## Dependencies
6
+
7
+ * kudzu 1.0+
8
+ * activerecord 5.0+
9
+
10
+ ## Installation
11
+
12
+ Add to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem 'kudzu-adapter-active_record'
16
+ ```
17
+
18
+ Then run:
19
+
20
+ $ bundle install
21
+
22
+ Create migration files:
23
+
24
+ $ rails generate kudzu:adapter:active_record:migration
25
+
26
+ Run the migrations in your application:
27
+
28
+ $ bundle exec rake db:migrate
29
+
30
+ These migrations create the following tables:
31
+
32
+ * kudzu_pages
33
+ * kudzu_contents
34
+ * kudzu_links
35
+
36
+ ## Usage
37
+
38
+ Basic usage is the same as for the [kudzu gem](https://github.com/kanety/kudzu).
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu-adapter-active_record. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
43
+
44
+ ## License
45
+
46
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
# Rake entry point for the gem: loads Bundler's gem tasks and registers an
# RSpec task that also serves as the default.
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Registers the `spec` task.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments runs the specs.
task default: :spec
@@ -0,0 +1,28 @@
require 'rails/generators'

module Kudzu
  module Adapter
    module ActiveRecord
      # Rails generator that renders the three kudzu migration templates
      # (pages, contents, links) into the application's db/migrate directory.
      class MigrationGenerator < Rails::Generators::Base
        source_root File.join(File.dirname(__FILE__), 'templates')

        # Renders one migration file per template. Each file gets a timestamp
        # one second later than the previous one so the versions are unique
        # and keep the listed order.
        def create
          @migration_version = migration_version
          started_at = Time.now.utc
          %w[create_kudzu_pages create_kudzu_contents create_kudzu_links].each_with_index do |filename, offset|
            # Offset the Time itself (not the formatted integer) so the result
            # is always a valid YYYYMMDDHHMMSS value and the step is exactly
            # one second per file.
            timestamp = (started_at + offset).strftime("%Y%m%d%H%M%S")
            template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
          end
        end

        private

        # Returns the "[MAJOR.MINOR]" suffix appended to ActiveRecord::Migration
        # in the templates, or nil when ActiveRecord is older than 5.
        def migration_version
          return unless ::ActiveRecord::VERSION::MAJOR >= 5

          "[#{::ActiveRecord::VERSION::MAJOR}.#{::ActiveRecord::VERSION::MINOR}]"
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
# Creates the kudzu_contents table: a binary payload tied to a page.
class CreateKudzuContents < ActiveRecord::Migration<%= @migration_version %>
  def change
    create_table(:kudzu_contents) do |table|
      table.references :page
      table.binary :data
      table.timestamps null: false
    end
  end
end
@@ -0,0 +1,16 @@
# Creates the kudzu_links table with indexes on uuid, url and state.
class CreateKudzuLinks < ActiveRecord::Migration<%= @migration_version %>
  def change
    create_table(:kudzu_links) do |table|
      table.text :uuid
      table.text :url
      table.text :title
      table.integer :state
      table.integer :depth
      table.timestamps null: false

      # `length:` asks for a prefix index on the text columns; presumably only
      # honoured by adapters that support index prefix lengths.
      table.index :uuid, length: 4
      table.index :url, length: 32
      table.index :state
    end
  end
end
@@ -0,0 +1,24 @@
# Creates the kudzu_pages table with indexes on url and digest.
class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
  def change
    create_table(:kudzu_pages) do |table|
      table.text :url
      table.text :title
      table.integer :status
      table.text :mime_type
      table.integer :size
      table.text :charset
      table.text :digest
      table.text :response_header
      table.float :response_time
      table.text :redirect_from
      table.datetime :fetched_at
      table.datetime :revised_at
      table.integer :revisit_interval
      table.datetime :revisit_at
      table.timestamps null: false

      # `length:` asks for a prefix index on the text columns; presumably only
      # honoured by adapters that support index prefix lengths.
      table.index :url, length: 32
      table.index :digest, length: 6
    end
  end
end
@@ -0,0 +1,19 @@
# Entry point of the adapter: declares the namespace, registers it as the
# kudzu adapter and loads the models, frontier and repository.
require 'activerecord-import'
require 'kudzu'

module Kudzu
  module Adapter
    # Namespace for the ActiveRecord-backed adapter classes.
    module ActiveRecord
    end
  end
end

Kudzu.adapter = Kudzu::Adapter::ActiveRecord

# When a Railtie constant is visible, loading is deferred until ActiveRecord
# itself has loaded; otherwise everything is required right away.
# NOTE(review): `Railtie` is resolved as a top-level constant here. If the
# intent was `Rails::Railtie`, the deferred branch is likely never taken —
# confirm before changing, since it affects load order inside Rails apps.
if defined?(Railtie)
  ActiveSupport.on_load(:active_record) { require_relative 'active_record/all' }
else
  require_relative 'active_record/all'
end
@@ -0,0 +1,10 @@
# Loads every adapter component in dependency order (the abstract base model
# first), then aliases the models under the top-level Kudzu namespace.
%w[
  model/base
  model/page
  model/content
  model/link
  frontier
  repository
].each { |path| require_relative path }

Kudzu::Page    = Kudzu::Adapter::ActiveRecord::Page
Kudzu::Link    = Kudzu::Adapter::ActiveRecord::Link
Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
@@ -0,0 +1,43 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Crawl frontier stored in the Link model. All queries are scoped by the
      # uuid given at construction. Links with state 0 are waiting to be
      # dequeued; dequeuing flips them to state 1.
      class Frontier
        # @param uuid [Object] value matched against Link#uuid in every query
        # @param config [Hash] accepted for interface compatibility; not used here
        def initialize(uuid, config = {})
          @uuid = uuid
          @monitor = Monitor.new
        end

        # Bulk-inserts the links whose URL is not already stored for this uuid.
        #
        # @param links [Array, Object] one link or a list of links responding to #url
        # @param depth [Integer] accepted for interface compatibility; not used here
        # @return [Array] the links that were actually inserted
        def enqueue(links, depth: 0)
          @monitor.synchronize do
            fresh = filter_existing_urls(links)
            # Nothing to insert: skip the bulk import call entirely.
            Link.import(fresh) unless fresh.empty?
            fresh
          end
        end

        # Takes up to +limit+ waiting links (lowest id first), marks each one
        # as state 1 and returns them.
        #
        # @param limit [Integer] maximum number of links to hand out
        # @return [Array] the dequeued links
        def dequeue(limit: 1)
          @monitor.synchronize do
            links = Link.where(uuid: @uuid, state: 0).order(id: :asc).limit(limit).to_a
            links.each do |link|
              link.state = 1
              link.save
            end
            links
          end
        end

        # Deletes every link stored for this uuid.
        def clear
          Link.where(uuid: @uuid).delete_all
        end

        private

        # Drops links whose URL is already stored for this uuid, as well as
        # repeated URLs within the given batch (first occurrence wins).
        def filter_existing_urls(links)
          links = Array(links)
          return links if links.empty?

          # Hash used as a set of URLs seen so far: O(1) membership checks.
          seen = {}
          Link.where(uuid: @uuid, url: links.map(&:url)).pluck(:url).each { |url| seen[url] = true }

          links.reject do |link|
            duplicate = seen.key?(link.url)
            seen[link.url] = true
            duplicate
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Abstract parent of the adapter's models; supplies the shared
      # 'kudzu_' table name prefix.
      class Base < ::ActiveRecord::Base
        self.abstract_class = true

        # Prefix put in front of each subclass's table name.
        def self.table_name_prefix
          'kudzu_'
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Content record associated with a single page.
      class Content < Base
        belongs_to(:page)
      end
    end
  end
end
@@ -0,0 +1,9 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Link model; mixes in the link behaviour shared by kudzu adapters.
      class Link < Base
        include(Kudzu::Adapter::Base::Link)
      end
    end
  end
end
@@ -0,0 +1,34 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Page model. Owns one content record (destroyed with the page) and
      # transparently JSON-encodes response_header when the column is a
      # text/string column.
      class Page < Base
        include Kudzu::Adapter::Base::Page

        has_one :content, dependent: :destroy

        # Reads response_header. For text/string columns the stored JSON is
        # parsed; an unset (nil) column yields nil instead of raising.
        def response_header
          raw = super
          return raw unless response_header_column_is_text?

          # JSON.parse(nil) raises TypeError, e.g. on a freshly initialised page.
          raw.nil? ? nil : JSON.parse(raw)
        end

        # Writes response_header. For text/string columns the value is stored
        # as JSON; nil is stored as NULL rather than the string "null".
        def response_header=(val)
          if response_header_column_is_text? && !val.nil?
            super(JSON.generate(val))
          else
            super
          end
        end

        private

        # True when the response_header column is declared as text or string,
        # i.e. when the value has to be serialised by this class.
        def response_header_column_is_text?
          type = self.class.columns_hash["response_header"].type
          type == :text || type == :string
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Persistence facade over the Page model.
      class Repository
        # Returns the stored page for +url+, or a new unsaved one.
        def find_by_url(url)
          Page.where(url: url).first_or_initialize
        end

        # Copies the page body (when present) into its content record, then
        # saves the page. Returns the result of Page#save.
        def register(page)
          store_body(page) if page.body
          page.save
        end

        # Destroys +page+; does nothing when no page is given.
        def delete(page)
          return unless page

          page.destroy
        end

        private

        # Assigns the page body to the existing content record, building one
        # first when the page has none.
        def store_body(page)
          record = page.content || page.build_content
          record.data = page.body
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
module Kudzu
  module Adapter
    module ActiveRecord
      # Gem version. Frozen so the shared constant cannot be mutated in place.
      VERSION = '1.0.0'.freeze
    end
  end
end
metadata ADDED
@@ -0,0 +1,199 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kudzu-adapter-active_record
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Yoshikazu Kaneta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-12-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: kudzu
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activerecord-import
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rails
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sqlite3
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec-rails
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pry-byebug
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: database_cleaner
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: ActiveRecord adapter for kudzu crawler
154
+ email:
155
+ - kaneta@sitebridge.co.jp
156
+ executables: []
157
+ extensions: []
158
+ extra_rdoc_files: []
159
+ files:
160
+ - README.md
161
+ - Rakefile
162
+ - lib/generators/kudzu/adapter/active_record/migration_generator.rb
163
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
164
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
165
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
166
+ - lib/kudzu/adapter/active_record.rb
167
+ - lib/kudzu/adapter/active_record/all.rb
168
+ - lib/kudzu/adapter/active_record/frontier.rb
169
+ - lib/kudzu/adapter/active_record/model/base.rb
170
+ - lib/kudzu/adapter/active_record/model/content.rb
171
+ - lib/kudzu/adapter/active_record/model/link.rb
172
+ - lib/kudzu/adapter/active_record/model/page.rb
173
+ - lib/kudzu/adapter/active_record/repository.rb
174
+ - lib/kudzu/adapter/active_record/version.rb
175
+ homepage: https://github.com/kanety/kudzu-adapter-active_record
176
+ licenses:
177
+ - MIT
178
+ metadata: {}
179
+ post_install_message:
180
+ rdoc_options: []
181
+ require_paths:
182
+ - lib
183
+ required_ruby_version: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ required_rubygems_version: !ruby/object:Gem::Requirement
189
+ requirements:
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
193
+ requirements: []
194
+ rubyforge_project:
195
+ rubygems_version: 2.5.2.2
196
+ signing_key:
197
+ specification_version: 4
198
+ summary: ActiveRecord adapter for kudzu crawler
199
+ test_files: []