daimon_skycrawlers 0.7.2 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/daimon_skycrawlers/filter/update_checker.rb +3 -0
- data/lib/daimon_skycrawlers/generator/new.rb +17 -1
- data/lib/daimon_skycrawlers/storage/rdb.rb +2 -2
- data/lib/daimon_skycrawlers/tasks/database_tasks.rake +1 -1
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/sample/itp-crawler/Gemfile.lock +4 -5
- data/sample/itp-crawler/db/migrate/{20161018044144_create_page.rb → 20161018044144_create_pages.rb} +4 -2
- data/sample/itp-crawler/db/schema.rb +2 -0
- data/sample/spider/db/migrate/20160830155803_create_pages.rb +3 -1
- data/sample/spider/db/schema.rb +2 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3c9d264378336af1acadb1bec4dc2dba783d9089
|
4
|
+
data.tar.gz: c532cd7213889299f369ba2e8e682ea69a39172a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 86bcf270ee971a26cd029ab9bd9372cfbe544e15640b3275e897e8b0ec2131976db379330a047ef5ea03bf9f568c111e083a135ae1fddd3d0ead38a373086e33
|
7
|
+
data.tar.gz: a9aef67c6bcb70488a1ce0c5e54ed565f65e7a5490b2e0b7db87feb2960801bb829467d991ae4fb3ab52ddef145d25315ba03ca1e2178256b5147b7b52a043a2
|
@@ -37,6 +37,9 @@ module DaimonSkycrawlers
|
|
37
37
|
when headers.key?("etag") && page.etag
|
38
38
|
headers["etag"] != page.etag
|
39
39
|
when headers.key?("last-modified") && page.last_modified_at
|
40
|
+
if headers["last-modified"] < page.last_modified_at
|
41
|
+
log.warn("#{url} returns old contents. #{headers["last-modified"]} < #{page.last_modified_at}")
|
42
|
+
end
|
40
43
|
headers["last-modified"] > page.last_modified_at
|
41
44
|
else
|
42
45
|
true
|
@@ -27,7 +27,7 @@ module DaimonSkycrawlers
|
|
27
27
|
timestamps: true
|
28
28
|
}
|
29
29
|
invoke(MigrationGenerator, [
|
30
|
-
"
|
30
|
+
"CreatePages",
|
31
31
|
"url:string",
|
32
32
|
"headers:text",
|
33
33
|
"body:binary",
|
@@ -37,6 +37,22 @@ module DaimonSkycrawlers
|
|
37
37
|
migration_options)
|
38
38
|
end
|
39
39
|
|
40
|
+
def insert_index
|
41
|
+
Dir.glob(File.join(destination_root, name, "db/migrate/*_create_pages.rb")) do |entry|
|
42
|
+
source = File.read(entry)
|
43
|
+
replaced_source = source.gsub(/(^ +)t.timestamps$/) do |_match; indent|
|
44
|
+
indent = $1
|
45
|
+
<<-CODE.chomp
|
46
|
+
#{indent}t.timestamps
|
47
|
+
|
48
|
+
#{indent}t.index [:url]
|
49
|
+
#{indent}t.index [:url, :updated_at]
|
50
|
+
CODE
|
51
|
+
end
|
52
|
+
File.write(entry, replaced_source)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
40
56
|
def copy_files
|
41
57
|
[
|
42
58
|
"Gemfile",
|
@@ -9,7 +9,7 @@ module DaimonSkycrawlers
|
|
9
9
|
class RDB < Base
|
10
10
|
def initialize(config_path = "config/database.yml")
|
11
11
|
super()
|
12
|
-
Base.configurations = YAML.
|
12
|
+
Base.configurations = YAML.load(ERB.new(::File.read(config_path)).result)
|
13
13
|
Base.establish_connection(DaimonSkycrawlers.env.to_sym)
|
14
14
|
end
|
15
15
|
|
@@ -34,7 +34,7 @@ module DaimonSkycrawlers
|
|
34
34
|
# @param [String] url identity of the page
|
35
35
|
#
|
36
36
|
def find(url)
|
37
|
-
Page.where(url: url).order(
|
37
|
+
Page.where(url: url).order(updated_at: :desc).limit(1).first
|
38
38
|
end
|
39
39
|
|
40
40
|
class Base < ActiveRecord::Base
|
@@ -43,7 +43,7 @@ Rake::Task.define_task("db:load_config") do
|
|
43
43
|
config.migrations_paths = ["db/migrate"]
|
44
44
|
config.fixtures_path = "test/fixtures"
|
45
45
|
config.seed_loader = seed_loader.new
|
46
|
-
config.database_configuration = YAML.
|
46
|
+
config.database_configuration = YAML.load(ERB.new(::File.read("config/database.yml")).result)
|
47
47
|
end
|
48
48
|
environment = ENV["SKYCRAWLERS_ENV"] || "development"
|
49
49
|
ActiveRecord::Base.configurations = ActiveRecord::Tasks::DatabaseTasks.database_configuration
|
@@ -1,9 +1,8 @@
|
|
1
1
|
PATH
|
2
2
|
remote: ../../
|
3
3
|
specs:
|
4
|
-
daimon_skycrawlers (0.
|
4
|
+
daimon_skycrawlers (0.7.2)
|
5
5
|
activerecord
|
6
|
-
bundler (~> 1.11)
|
7
6
|
faraday
|
8
7
|
faraday_middleware
|
9
8
|
nokogiri
|
@@ -45,7 +44,7 @@ GEM
|
|
45
44
|
amq-protocol (2.0.1)
|
46
45
|
arel (7.1.4)
|
47
46
|
builder (3.2.2)
|
48
|
-
bunny (2.6.
|
47
|
+
bunny (2.6.1)
|
49
48
|
amq-protocol (>= 2.0.1)
|
50
49
|
concurrent-ruby (1.0.2)
|
51
50
|
erubis (2.7.0)
|
@@ -89,8 +88,8 @@ GEM
|
|
89
88
|
thread_safe (0.3.5)
|
90
89
|
timers (4.1.1)
|
91
90
|
hitimes
|
92
|
-
typhoeus (
|
93
|
-
ethon (>= 0.
|
91
|
+
typhoeus (1.1.0)
|
92
|
+
ethon (>= 0.9.0)
|
94
93
|
tzinfo (1.2.2)
|
95
94
|
thread_safe (~> 0.1)
|
96
95
|
webrobots (0.1.2)
|
data/sample/itp-crawler/db/migrate/{20161018044144_create_page.rb → 20161018044144_create_pages.rb}
RENAMED
@@ -1,13 +1,15 @@
|
|
1
|
-
class
|
1
|
+
class CreatePages < ActiveRecord::Migration[5.0]
|
2
2
|
def change
|
3
3
|
create_table :pages do |t|
|
4
|
-
t.string :url
|
4
|
+
t.string :url, index: true
|
5
5
|
t.text :headers
|
6
6
|
t.binary :body
|
7
7
|
t.datetime :last_modified_at
|
8
8
|
t.string :etag
|
9
9
|
|
10
10
|
t.timestamps
|
11
|
+
|
12
|
+
t.index [:url, :updated_at]
|
11
13
|
end
|
12
14
|
end
|
13
15
|
end
|
@@ -23,6 +23,8 @@ ActiveRecord::Schema.define(version: 20161018044144) do
|
|
23
23
|
t.string "etag"
|
24
24
|
t.datetime "created_at", null: false
|
25
25
|
t.datetime "updated_at", null: false
|
26
|
+
t.index ["url", "updated_at"], name: "index_pages_on_url_and_updated_at", using: :btree
|
27
|
+
t.index ["url"], name: "index_pages_on_url", using: :btree
|
26
28
|
end
|
27
29
|
|
28
30
|
end
|
@@ -1,13 +1,15 @@
|
|
1
1
|
class CreatePages < ActiveRecord::Migration
|
2
2
|
def change
|
3
3
|
create_table :pages do |t|
|
4
|
-
t.string :url
|
4
|
+
t.string :url, index: true
|
5
5
|
t.text :headers
|
6
6
|
t.binary :body, limit: 10 * 1024 ** 2 # 10MiB
|
7
7
|
t.datetime :last_modified_at
|
8
8
|
t.string :etag
|
9
9
|
|
10
10
|
t.timestamps null: false
|
11
|
+
|
12
|
+
t.index [:url, :updated_at]
|
11
13
|
end
|
12
14
|
end
|
13
15
|
end
|
data/sample/spider/db/schema.rb
CHANGED
@@ -23,6 +23,8 @@ ActiveRecord::Schema.define(version: 20160830155803) do
|
|
23
23
|
t.string "etag"
|
24
24
|
t.datetime "created_at", null: false
|
25
25
|
t.datetime "updated_at", null: false
|
26
|
+
t.index ["url", "updated_at"], name: "index_pages_on_url_and_updated_at", using: :btree
|
27
|
+
t.index ["url"], name: "index_pages_on_url", using: :btree
|
26
28
|
end
|
27
29
|
|
28
30
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- daimon developers
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -363,7 +363,7 @@ files:
|
|
363
363
|
- sample/itp-crawler/config/database.yml
|
364
364
|
- sample/itp-crawler/config/database_itp.yml
|
365
365
|
- sample/itp-crawler/config/init.rb
|
366
|
-
- sample/itp-crawler/db/migrate/
|
366
|
+
- sample/itp-crawler/db/migrate/20161018044144_create_pages.rb
|
367
367
|
- sample/itp-crawler/db/schema.rb
|
368
368
|
- sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb
|
369
369
|
- sample/itp-crawler/db_itp/schema.rb
|