kudzu-adapter-active_record 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b41da782783d660d5366be6d2c235d50722f8574
4
+ data.tar.gz: f41f69bf3555773358301b07e9b6832c36424a79
5
+ SHA512:
6
+ metadata.gz: 1e3f5cc57bb822445c288c3a999614a18811ae3e835205b44295c8ac38b804327254436bb85236545822de1e097e20af35971dccb8cfd7bc09ffcf80f3ff86a5
7
+ data.tar.gz: d8626598ce2b61a69eb75f3d897c0b881847a2745bd36db63cb2443b6e5f24300c6e08562c219556c824c932f548ca65fece1303cf4be581771c71aa528eda26
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # Kudzu::Adapter::ActiveRecord
2
+
3
+ ActiveRecord adapter for kudzu crawler.
4
+
5
+ ## Dependencies
6
+
7
+ * kudzu 1.0+
8
+ * activerecord 5.0+
9
+
10
+ ## Installation
11
+
12
+ Add to your application's Gemfile:
13
+
14
+ ```ruby
15
+ gem 'kudzu-adapter-active_record'
16
+ ```
17
+
18
+ Then run:
19
+
20
+ $ bundle install
21
+
22
+ Create migration files:
23
+
24
+ $ rails generate kudzu:adapter:active_record:migration
25
+
26
+ Run the migrations in your application:
27
+
28
+ $ bundle exec rake db:migrate
29
+
30
+ These migrations create the following tables:
31
+
32
+ * kudzu_pages
33
+ * kudzu_contents
34
+ * kudzu_links
35
+
36
+ ## Usage
37
+
38
+ Basic usage is the same as the [kudzu gem](https://github.com/kanety/kudzu).
39
+
40
+ ## Contributing
41
+
42
+ Bug reports and pull requests are welcome on GitHub at https://github.com/kanety/kudzu-adapter-active_record. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
43
+
44
+ ## License
45
+
46
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,28 @@
1
require 'rails/generators'

module Kudzu
  module Adapter
    module ActiveRecord
      # Rails generator that copies the kudzu migration templates into the
      # host application's db/migrate directory.
      class MigrationGenerator < Rails::Generators::Base
        # Template base names, in the order the migrations are written.
        MIGRATION_NAMES = %w[create_kudzu_pages create_kudzu_contents create_kudzu_links].freeze

        source_root File.join(File.dirname(__FILE__), 'templates')

        # Renders one migration file per entry in MIGRATION_NAMES.
        #
        # Each file gets its own timestamp, one second after the previous one.
        # The offset is applied to the Time object *before* formatting, so the
        # prefix is always a valid YYYYMMDDHHMMSS value (integer arithmetic on
        # the formatted number could produce a seconds field of 60 or more).
        def create
          @migration_version = migration_version
          started_at = Time.now.utc
          MIGRATION_NAMES.each_with_index do |filename, offset|
            timestamp = (started_at + offset).strftime('%Y%m%d%H%M%S')
            template "#{filename}.rb.erb", "db/migrate/#{timestamp}_#{filename}.rb"
          end
        end

        private

        # Returns the "[major.minor]" suffix interpolated after
        # ActiveRecord::Migration in the templates, or nil when the loaded
        # ActiveRecord major version is below 5.
        def migration_version
          return unless ::ActiveRecord::VERSION::MAJOR >= 5

          "[#{::ActiveRecord::VERSION::MAJOR}.#{::ActiveRecord::VERSION::MINOR}]"
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
+ class CreateKudzuContents < ActiveRecord::Migration<%= @migration_version %>
2
+ def change
3
+ create_table :kudzu_contents do |t|
4
+ t.references :page
5
+ t.binary :data
6
+ t.timestamps null: false
7
+ end
8
+ end
9
+ end
@@ -0,0 +1,16 @@
1
+ class CreateKudzuLinks < ActiveRecord::Migration<%= @migration_version %>
2
+ def change
3
+ create_table :kudzu_links do |t|
4
+ t.text :uuid
5
+ t.text :url
6
+ t.text :title
7
+ t.integer :state
8
+ t.integer :depth
9
+ t.timestamps null: false
10
+
11
+ t.index :uuid, length: 4
12
+ t.index :url, length: 32
13
+ t.index :state
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,24 @@
1
+ class CreateKudzuPages < ActiveRecord::Migration<%= @migration_version %>
2
+ def change
3
+ create_table :kudzu_pages do |t|
4
+ t.text :url
5
+ t.text :title
6
+ t.integer :status
7
+ t.text :mime_type
8
+ t.integer :size
9
+ t.text :charset
10
+ t.text :digest
11
+ t.text :response_header
12
+ t.float :response_time
13
+ t.text :redirect_from
14
+ t.datetime :fetched_at
15
+ t.datetime :revised_at
16
+ t.integer :revisit_interval
17
+ t.datetime :revisit_at
18
+ t.timestamps null: false
19
+
20
+ t.index :url, length: 32
21
+ t.index :digest, length: 6
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,19 @@
1
require 'activerecord-import'
require 'kudzu'

module Kudzu
  module Adapter
    # Namespace for the ActiveRecord-backed kudzu adapter.
    module ActiveRecord
    end
  end
end

# Register this adapter with kudzu.
Kudzu.adapter = Kudzu::Adapter::ActiveRecord

# Inside a Rails application, defer loading the models until ActiveRecord::Base
# itself has been loaded; elsewhere load them right away.
#
# The check must name Rails::Railtie: a bare `Railtie` is looked up as a
# top-level constant, which Rails does not define, so the deferred branch
# could never run.
# NOTE(review): with the deferred branch now reachable, the adapter classes
# are unavailable until ActiveRecord::Base is first loaded - confirm no caller
# references them earlier during boot.
if defined?(Rails::Railtie)
  ActiveSupport.on_load(:active_record) do
    require_relative 'active_record/all'
  end
else
  require_relative 'active_record/all'
end
@@ -0,0 +1,10 @@
1
+ require_relative 'model/base'
2
+ require_relative 'model/page'
3
+ require_relative 'model/content'
4
+ require_relative 'model/link'
5
+ require_relative 'frontier'
6
+ require_relative 'repository'
7
+
8
+ Kudzu::Page = Kudzu::Adapter::ActiveRecord::Page
9
+ Kudzu::Link = Kudzu::Adapter::ActiveRecord::Link
10
+ Kudzu::Content = Kudzu::Adapter::ActiveRecord::Content
@@ -0,0 +1,43 @@
1
require 'monitor'
require 'set'

module Kudzu
  module Adapter
    module ActiveRecord
      # Link queue stored through the Link model.
      #
      # All queries are scoped by the uuid given at construction. #dequeue
      # picks up rows whose state is 0 and flips them to 1.
      class Frontier
        # @param uuid [Object] value matched against Link#uuid in every query
        # @param config [Hash] accepted but not used by this class
        def initialize(uuid, config = {})
          @uuid = uuid
          @monitor = Monitor.new
        end

        # Bulk-inserts the links whose URL is not yet stored for this uuid.
        #
        # @param links [Array<Link>, Link, nil] candidate link(s)
        # @param depth [Integer] accepted but not used by this class
        # @return [Array<Link>] the links that were actually inserted
        def enqueue(links, depth: 0)
          @monitor.synchronize do
            fresh = filter_existing_urls(links)
            # Nothing to insert: skip the bulk import call altogether.
            Link.import(fresh) unless fresh.empty?
            fresh
          end
        end

        # Fetches up to +limit+ links with state 0, ordered by id, and marks
        # each of them with state 1 before returning them.
        #
        # @param limit [Integer] maximum number of links to hand out
        # @return [Array<Link>]
        def dequeue(limit: 1)
          @monitor.synchronize do
            links = Link.where(uuid: @uuid, state: 0).order(id: :asc).limit(limit).to_a
            links.each do |link|
              link.state = 1
              link.save
            end
            links
          end
        end

        # Deletes every link stored for this uuid.
        def clear
          Link.where(uuid: @uuid).delete_all
        end

        private

        # Returns the given links minus those whose URL is already stored for
        # this uuid, and minus repeated URLs within the batch itself (the
        # first occurrence wins).
        def filter_existing_urls(links)
          links = Array(links)
          return links if links.empty?

          seen = Link.where(uuid: @uuid, url: links.map(&:url)).pluck(:url).to_set
          # Set#add? returns nil when the URL was already present, so each URL
          # passes the filter at most once.
          links.select { |link| seen.add?(link.url) }
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Kudzu
  module Adapter
    module ActiveRecord
      # Abstract parent class of the adapter's models. It has no table of its
      # own and supplies the table name prefix shared by its subclasses.
      class Base < ::ActiveRecord::Base
        self.abstract_class = true

        # Prefix applied to the table names of the adapter's models.
        def self.table_name_prefix
          'kudzu_'
        end
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Kudzu
  module Adapter
    module ActiveRecord
      # Model for a stored page payload. Every record belongs to a single
      # page; the accompanying migration template gives its table a page
      # reference and a binary data column.
      class Content < Base
        belongs_to(:page)
      end
    end
  end
end
@@ -0,0 +1,9 @@
1
module Kudzu
  module Adapter
    module ActiveRecord
      # ActiveRecord-backed link model. Its behaviour comes from the
      # Kudzu::Adapter::Base::Link mixin, which is defined outside this gem's
      # sources shown here (presumably in the kudzu gem - verify).
      class Link < Base
        include ::Kudzu::Adapter::Base::Link
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
require 'json'

module Kudzu
  module Adapter
    module ActiveRecord
      # ActiveRecord-backed page model.
      #
      # When the response_header column is a :text or :string column, the
      # value is stored as a JSON document and decoded on read; for any other
      # column type the value is passed through unchanged.
      class Page < Base
        include Kudzu::Adapter::Base::Page

        has_one :content, dependent: :destroy

        # Reads the response header.
        #
        # @return [Object, nil] the decoded JSON value for text columns, the
        #   raw attribute otherwise; nil when nothing has been stored yet
        def response_header
          raw = super
          return raw unless response_header_column_is_text?
          # A page that has never been fetched has no header yet; parsing nil
          # or an empty string would raise instead of reporting "no header".
          return nil if raw.nil? || raw.empty?

          JSON.parse(raw)
        end

        # Writes the response header, JSON-encoding it for text columns.
        # nil is stored as NULL rather than as the JSON string "null".
        #
        # @param val [Object, nil] header value to store
        def response_header=(val)
          if response_header_column_is_text?
            super(val.nil? ? nil : JSON.generate(val))
          else
            super
          end
        end

        private

        # True when the response_header column type is :text or :string.
        def response_header_column_is_text?
          type = self.class.columns_hash["response_header"].type
          type == :text || type == :string
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module Kudzu
  module Adapter
    module ActiveRecord
      # Stores, looks up and removes pages through the Page model.
      class Repository
        # Returns the first page stored with the given URL, or a new unsaved
        # page initialised with that URL.
        #
        # @param url [String]
        # @return [Page]
        def find_by_url(url)
          Page.where(url: url).first_or_initialize
        end

        # Saves the page and, when it carries a body, its content record.
        #
        # @param page [Page] page to persist; +page.body+ may be nil
        # @return [Boolean] false when saving the page or its content failed
        def register(page)
          content = nil
          if page.body
            content = page.content || page.build_content
            content.data = page.body
          end

          saved = page.save
          # The has_one :content association is declared without autosave, so
          # saving the page only persists a newly built content. An existing
          # content whose data was just replaced has to be saved explicitly.
          saved &&= content.save if content && content.persisted? && content.changed?
          saved
        end

        # Destroys the page; does nothing when +page+ is nil.
        def delete(page)
          page.destroy if page
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ module Kudzu
2
+ module Adapter
3
+ module ActiveRecord
4
+ VERSION = '1.0.0'
5
+ end
6
+ end
7
+ end
metadata ADDED
@@ -0,0 +1,199 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kudzu-adapter-active_record
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Yoshikazu Kaneta
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-12-20 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: kudzu
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '1.0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: activerecord-import
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rails
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: sqlite3
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rspec-rails
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: simplecov
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: pry-rails
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: pry-byebug
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: database_cleaner
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: ActiveRecord adapter for kudzu crawler
154
+ email:
155
+ - kaneta@sitebridge.co.jp
156
+ executables: []
157
+ extensions: []
158
+ extra_rdoc_files: []
159
+ files:
160
+ - README.md
161
+ - Rakefile
162
+ - lib/generators/kudzu/adapter/active_record/migration_generator.rb
163
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_contents.rb.erb
164
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_links.rb.erb
165
+ - lib/generators/kudzu/adapter/active_record/templates/create_kudzu_pages.rb.erb
166
+ - lib/kudzu/adapter/active_record.rb
167
+ - lib/kudzu/adapter/active_record/all.rb
168
+ - lib/kudzu/adapter/active_record/frontier.rb
169
+ - lib/kudzu/adapter/active_record/model/base.rb
170
+ - lib/kudzu/adapter/active_record/model/content.rb
171
+ - lib/kudzu/adapter/active_record/model/link.rb
172
+ - lib/kudzu/adapter/active_record/model/page.rb
173
+ - lib/kudzu/adapter/active_record/repository.rb
174
+ - lib/kudzu/adapter/active_record/version.rb
175
+ homepage: https://github.com/kanety/kudzu-adapter-active_record
176
+ licenses:
177
+ - MIT
178
+ metadata: {}
179
+ post_install_message:
180
+ rdoc_options: []
181
+ require_paths:
182
+ - lib
183
+ required_ruby_version: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ required_rubygems_version: !ruby/object:Gem::Requirement
189
+ requirements:
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
193
+ requirements: []
194
+ rubyforge_project:
195
+ rubygems_version: 2.5.2.2
196
+ signing_key:
197
+ specification_version: 4
198
+ summary: ActiveRecord adapter for kudzu crawler
199
+ test_files: []