daimon_skycrawlers 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/daimon_skycrawlers.gemspec +1 -1
- data/lib/daimon_skycrawlers/commands/enqueue.rb +7 -4
- data/lib/daimon_skycrawlers/crawler/base.rb +2 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +0 -2
- data/lib/daimon_skycrawlers/crawler.rb +1 -0
- data/lib/daimon_skycrawlers/processor.rb +27 -0
- data/lib/daimon_skycrawlers/sitemap_parser.rb +79 -0
- data/lib/daimon_skycrawlers/storage/rdb.rb +7 -4
- data/lib/daimon_skycrawlers/tasks/database_tasks.rake +10 -11
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +7 -0
- data/sample/itp-crawler/Gemfile +4 -0
- data/sample/itp-crawler/Gemfile.lock +106 -0
- data/sample/itp-crawler/README.md +50 -0
- data/sample/itp-crawler/Rakefile +2 -0
- data/sample/itp-crawler/app/crawlers/itp_crawler.rb +8 -0
- data/sample/itp-crawler/app/models/itp_base.rb +5 -0
- data/sample/itp-crawler/app/models/itp_shop.rb +5 -0
- data/sample/itp-crawler/app/processors/itp_processor.rb +95 -0
- data/sample/itp-crawler/config/database.yml +26 -0
- data/sample/itp-crawler/config/database_itp.yml +26 -0
- data/sample/itp-crawler/config/init.rb +22 -0
- data/sample/itp-crawler/db/migrate/20161018044144_create_page.rb +13 -0
- data/sample/itp-crawler/db/schema.rb +28 -0
- data/sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb +14 -0
- data/sample/itp-crawler/db_itp/schema.rb +31 -0
- data/sample/itp-crawler/lib/tasks/database_tasks.rb +66 -0
- data/sample/itp-crawler/lib/tasks.rb +1 -0
- metadata +21 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e81cffca8abac23dba0da166eb2a1f6cef80b2c1
|
|
4
|
+
data.tar.gz: d4948c33c37ee8d9508d93c037919a784a2d21cc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4840d12acec8c75a13330810029a32c62827e35a5ca871b06dec0b1f80e7be9b8dd9fa08208a4b0daac4cd527a32ba1bdc1c893c815834ebbfe22750fd440997
|
|
7
|
+
data.tar.gz: 072f215aa3f245445c9ca335651fe875c238039978f3cf0eac6bc873b0d34b1c1dd4626cfa2320ce4ac5e9b1defe05ffba8ec63dcf9e5c8197472c2fa5cd9cea
|
data/daimon_skycrawlers.gemspec
CHANGED
|
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
|
|
|
28
28
|
spec.add_dependency "railties"
|
|
29
29
|
spec.add_dependency "pg"
|
|
30
30
|
spec.add_dependency "timers"
|
|
31
|
-
spec.add_dependency "
|
|
31
|
+
spec.add_dependency "typhoeus"
|
|
32
32
|
spec.add_dependency "webrobots"
|
|
33
33
|
|
|
34
34
|
spec.add_development_dependency "rake", "~> 10.0"
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
require "daimon_skycrawlers"
|
|
2
2
|
require "daimon_skycrawlers/crawler"
|
|
3
3
|
require "daimon_skycrawlers/processor"
|
|
4
|
+
require "daimon_skycrawlers/sitemap_parser"
|
|
4
5
|
require "daimon_skycrawlers/version"
|
|
5
|
-
require "sitemap-parser"
|
|
6
6
|
require "webrobots"
|
|
7
7
|
|
|
8
8
|
module DaimonSkycrawlers
|
|
@@ -27,6 +27,7 @@ module DaimonSkycrawlers
|
|
|
27
27
|
desc "sitemap [OPTIONS] URL", "Enqueue URLs from simtemap.xml"
|
|
28
28
|
method_option("robots-txt", aliases: ["-r"], type: :boolean,
|
|
29
29
|
desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
|
|
30
|
+
method_option("dump", type: :boolean, desc: "Dump URLs without enqueue")
|
|
30
31
|
def sitemap(url)
|
|
31
32
|
load_init
|
|
32
33
|
if options["robots-txt"]
|
|
@@ -35,9 +36,11 @@ module DaimonSkycrawlers
|
|
|
35
36
|
else
|
|
36
37
|
sitemaps = [url]
|
|
37
38
|
end
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
39
|
+
sitemap_parser = DaimonSkycrawlers::SitemapParser.new(sitemaps)
|
|
40
|
+
urls = sitemap_parser.parse
|
|
41
|
+
if options["dump"]
|
|
42
|
+
puts urls.join("\n")
|
|
43
|
+
return
|
|
41
44
|
end
|
|
42
45
|
urls.each do |_url|
|
|
43
46
|
DaimonSkycrawlers::Crawler.enqueue_url(_url)
|
|
@@ -5,6 +5,8 @@ require "daimon_skycrawlers/logger"
|
|
|
5
5
|
require "daimon_skycrawlers/config"
|
|
6
6
|
require "daimon_skycrawlers/storage"
|
|
7
7
|
require "daimon_skycrawlers/processor"
|
|
8
|
+
require "daimon_skycrawlers/filter/update_checker"
|
|
9
|
+
require "daimon_skycrawlers/filter/robots_txt_checker"
|
|
8
10
|
|
|
9
11
|
module DaimonSkycrawlers
|
|
10
12
|
module Crawler
|
|
@@ -6,24 +6,51 @@ require "daimon_skycrawlers/consumer/http_response"
|
|
|
6
6
|
module DaimonSkycrawlers
|
|
7
7
|
module Processor
|
|
8
8
|
class << self
|
|
9
|
+
#
|
|
10
|
+
# Run registered processors
|
|
11
|
+
#
|
|
12
|
+
# @param process_name [String] Process name
|
|
13
|
+
#
|
|
9
14
|
def run(process_name: default_process_name)
|
|
10
15
|
DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
|
|
11
16
|
SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
|
|
12
17
|
end
|
|
13
18
|
|
|
19
|
+
#
|
|
20
|
+
# Enqueue a URL to processor queue
|
|
21
|
+
#
|
|
22
|
+
# @param url [String] Specify absolute URL
|
|
23
|
+
# @param message [Hash] Extra parameters for crawler
|
|
24
|
+
# @return [void]
|
|
14
25
|
def enqueue_http_response(url, message = {})
|
|
15
26
|
message[:url] = url
|
|
27
|
+
config.logger.debug("#{queue_name}: #{url}")
|
|
16
28
|
SongkickQueue.publish(queue_name, message)
|
|
17
29
|
end
|
|
18
30
|
|
|
31
|
+
#
|
|
32
|
+
# Shortcut of DaimonSkycrawlers.configuration
|
|
33
|
+
#
|
|
34
|
+
# @return [DaimonSkycrawlers::Configuration]
|
|
35
|
+
#
|
|
19
36
|
def config
|
|
20
37
|
DaimonSkycrawlers.configuration
|
|
21
38
|
end
|
|
22
39
|
|
|
40
|
+
#
|
|
41
|
+
# Queue name for processor
|
|
42
|
+
#
|
|
43
|
+
# @return [String] Queue name
|
|
44
|
+
#
|
|
23
45
|
def queue_name
|
|
24
46
|
"#{config.queue_name_prefix}.http-response"
|
|
25
47
|
end
|
|
26
48
|
|
|
49
|
+
#
|
|
50
|
+
# Default process name
|
|
51
|
+
#
|
|
52
|
+
# @return [String] Default process name
|
|
53
|
+
#
|
|
27
54
|
def default_process_name
|
|
28
55
|
"#{config.queue_name_prefix}:http-response"
|
|
29
56
|
end
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
require "nokogiri"
|
|
2
|
+
require "typhoeus"
|
|
3
|
+
require "zlib"
|
|
4
|
+
require "uri"
|
|
5
|
+
|
|
6
|
+
module DaimonSkycrawlers
|
|
7
|
+
# Based on https://github.com/benbalter/sitemap-parser
|
|
8
|
+
class SitemapParser
|
|
9
|
+
def initialize(urls, options = {})
|
|
10
|
+
@urls = urls
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def parse
|
|
14
|
+
hydra = Typhoeus::Hydra.new(max_concurrency: 1)
|
|
15
|
+
sitemap_urls = []
|
|
16
|
+
@urls.each do |url|
|
|
17
|
+
if URI(url).scheme.start_with?("http")
|
|
18
|
+
request = Typhoeus::Request.new(url, followlocation: true)
|
|
19
|
+
request.on_complete do |response|
|
|
20
|
+
sitemap_urls.concat(on_complete(response))
|
|
21
|
+
end
|
|
22
|
+
hydra.queue(request)
|
|
23
|
+
else
|
|
24
|
+
if File.exist?(url)
|
|
25
|
+
extract_urls(File.read(url))
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
hydra.run
|
|
30
|
+
sitemap_urls
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
private
|
|
34
|
+
|
|
35
|
+
def on_complete(response)
|
|
36
|
+
raise "HTTP requset to #{response.effective_url} failed" unless response.success?
|
|
37
|
+
raw_sitemap = inflate_response(response)
|
|
38
|
+
extract_urls(raw_sitemap)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def extract_urls(body)
|
|
42
|
+
sitemap = Nokogiri::XML(body)
|
|
43
|
+
case
|
|
44
|
+
when sitemap.at("sitemapindex")
|
|
45
|
+
urls = sitemap.search("sitemap").flat_map do |s|
|
|
46
|
+
s.at("loc").content
|
|
47
|
+
end
|
|
48
|
+
SitemapParser.new(urls).parse
|
|
49
|
+
when sitemap.at("urlset")
|
|
50
|
+
sitemap.search("url").flat_map do |url|
|
|
51
|
+
url.at("loc").content
|
|
52
|
+
end
|
|
53
|
+
else
|
|
54
|
+
raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def inflate_response(response)
|
|
59
|
+
if compressed?(response)
|
|
60
|
+
# We cannot inflate compressed data from NTFS filesystem (NT).
|
|
61
|
+
# This can avoid errors
|
|
62
|
+
stream = Zlib::Inflate.new(Zlib::MAX_WBITS + 32)
|
|
63
|
+
stream.inflate(response.body)
|
|
64
|
+
else
|
|
65
|
+
response.body
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
def compressed?(response)
|
|
70
|
+
case response.headers["Content-Encoding"]&.downcase
|
|
71
|
+
when "deflate", "gzip", "x-gzip"
|
|
72
|
+
true
|
|
73
|
+
else
|
|
74
|
+
signature = response.body[0, 2]
|
|
75
|
+
signature == "\x1F\x8B".b
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
@@ -9,9 +9,8 @@ module DaimonSkycrawlers
|
|
|
9
9
|
class RDB < Base
|
|
10
10
|
def initialize(config_path = "config/database.yml")
|
|
11
11
|
super()
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
ActiveRecord::Base.establish_connection(config[environment])
|
|
12
|
+
Base.configurations = YAML.load_file(config_path)
|
|
13
|
+
Base.establish_connection(DaimonSkycrawlers.env.to_sym)
|
|
15
14
|
end
|
|
16
15
|
|
|
17
16
|
#
|
|
@@ -38,7 +37,11 @@ module DaimonSkycrawlers
|
|
|
38
37
|
Page.where(url: url).order(last_modified_at: :desc).limit(1).first
|
|
39
38
|
end
|
|
40
39
|
|
|
41
|
-
class
|
|
40
|
+
class Base < ActiveRecord::Base
|
|
41
|
+
self.abstract_class = true
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
class Page < Base
|
|
42
45
|
self.table_name = "pages"
|
|
43
46
|
end
|
|
44
47
|
end
|
|
@@ -30,24 +30,23 @@ seed_loader = Class.new do
|
|
|
30
30
|
end
|
|
31
31
|
end
|
|
32
32
|
|
|
33
|
-
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
|
34
|
-
config.root = Rake.application.original_dir
|
|
35
|
-
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
|
36
|
-
config.db_dir = "db"
|
|
37
|
-
config.migrations_paths = ["db/migrate"]
|
|
38
|
-
config.fixtures_path = "test/fixtures"
|
|
39
|
-
config.seed_loader = seed_loader.new
|
|
40
|
-
config.database_configuration = ActiveRecord::Base.configurations
|
|
41
|
-
end
|
|
42
|
-
|
|
43
33
|
# db:load_config can be overridden manually
|
|
44
34
|
Rake::Task["db:seed"].enhance(["db:load_config"])
|
|
45
35
|
Rake::Task["db:load_config"].clear
|
|
46
36
|
|
|
47
37
|
Rake::Task.define_task("db:environment")
|
|
48
38
|
Rake::Task.define_task("db:load_config") do
|
|
49
|
-
ActiveRecord::
|
|
39
|
+
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
|
40
|
+
config.root = Rake.application.original_dir
|
|
41
|
+
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
|
42
|
+
config.db_dir = "db"
|
|
43
|
+
config.migrations_paths = ["db/migrate"]
|
|
44
|
+
config.fixtures_path = "test/fixtures"
|
|
45
|
+
config.seed_loader = seed_loader.new
|
|
46
|
+
config.database_configuration = YAML.load_file("config/database.yml")
|
|
47
|
+
end
|
|
50
48
|
environment = ENV["SKYCRAWLERS_ENV"] || "development"
|
|
49
|
+
ActiveRecord::Base.configurations = ActiveRecord::Tasks::DatabaseTasks.database_configuration
|
|
51
50
|
ActiveRecord::Base.establish_connection(environment.to_sym)
|
|
52
51
|
end
|
|
53
52
|
Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
|
data/lib/daimon_skycrawlers.rb
CHANGED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: ../../
|
|
3
|
+
specs:
|
|
4
|
+
daimon_skycrawlers (0.6.0)
|
|
5
|
+
activerecord
|
|
6
|
+
bundler (~> 1.11)
|
|
7
|
+
faraday
|
|
8
|
+
faraday_middleware
|
|
9
|
+
nokogiri
|
|
10
|
+
pg
|
|
11
|
+
railties
|
|
12
|
+
songkick_queue
|
|
13
|
+
thor
|
|
14
|
+
timers
|
|
15
|
+
typhoeus
|
|
16
|
+
webrobots
|
|
17
|
+
|
|
18
|
+
GEM
|
|
19
|
+
remote: https://rubygems.org/
|
|
20
|
+
specs:
|
|
21
|
+
actionpack (5.0.0.1)
|
|
22
|
+
actionview (= 5.0.0.1)
|
|
23
|
+
activesupport (= 5.0.0.1)
|
|
24
|
+
rack (~> 2.0)
|
|
25
|
+
rack-test (~> 0.6.3)
|
|
26
|
+
rails-dom-testing (~> 2.0)
|
|
27
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
|
|
28
|
+
actionview (5.0.0.1)
|
|
29
|
+
activesupport (= 5.0.0.1)
|
|
30
|
+
builder (~> 3.1)
|
|
31
|
+
erubis (~> 2.7.0)
|
|
32
|
+
rails-dom-testing (~> 2.0)
|
|
33
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
|
|
34
|
+
activemodel (5.0.0.1)
|
|
35
|
+
activesupport (= 5.0.0.1)
|
|
36
|
+
activerecord (5.0.0.1)
|
|
37
|
+
activemodel (= 5.0.0.1)
|
|
38
|
+
activesupport (= 5.0.0.1)
|
|
39
|
+
arel (~> 7.0)
|
|
40
|
+
activesupport (5.0.0.1)
|
|
41
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
|
42
|
+
i18n (~> 0.7)
|
|
43
|
+
minitest (~> 5.1)
|
|
44
|
+
tzinfo (~> 1.1)
|
|
45
|
+
amq-protocol (2.0.1)
|
|
46
|
+
arel (7.1.4)
|
|
47
|
+
builder (3.2.2)
|
|
48
|
+
bunny (2.6.0)
|
|
49
|
+
amq-protocol (>= 2.0.1)
|
|
50
|
+
concurrent-ruby (1.0.2)
|
|
51
|
+
erubis (2.7.0)
|
|
52
|
+
ethon (0.9.1)
|
|
53
|
+
ffi (>= 1.3.0)
|
|
54
|
+
faraday (0.9.2)
|
|
55
|
+
multipart-post (>= 1.2, < 3)
|
|
56
|
+
faraday_middleware (0.10.0)
|
|
57
|
+
faraday (>= 0.7.4, < 0.10)
|
|
58
|
+
ffi (1.9.14)
|
|
59
|
+
hitimes (1.2.4)
|
|
60
|
+
i18n (0.7.0)
|
|
61
|
+
loofah (2.0.3)
|
|
62
|
+
nokogiri (>= 1.5.9)
|
|
63
|
+
method_source (0.8.2)
|
|
64
|
+
mini_portile2 (2.1.0)
|
|
65
|
+
minitest (5.9.1)
|
|
66
|
+
multipart-post (2.0.0)
|
|
67
|
+
nokogiri (1.6.8.1)
|
|
68
|
+
mini_portile2 (~> 2.1.0)
|
|
69
|
+
pg (0.19.0)
|
|
70
|
+
rack (2.0.1)
|
|
71
|
+
rack-test (0.6.3)
|
|
72
|
+
rack (>= 1.0)
|
|
73
|
+
rails-dom-testing (2.0.1)
|
|
74
|
+
activesupport (>= 4.2.0, < 6.0)
|
|
75
|
+
nokogiri (~> 1.6.0)
|
|
76
|
+
rails-html-sanitizer (1.0.3)
|
|
77
|
+
loofah (~> 2.0)
|
|
78
|
+
railties (5.0.0.1)
|
|
79
|
+
actionpack (= 5.0.0.1)
|
|
80
|
+
activesupport (= 5.0.0.1)
|
|
81
|
+
method_source
|
|
82
|
+
rake (>= 0.8.7)
|
|
83
|
+
thor (>= 0.18.1, < 2.0)
|
|
84
|
+
rake (11.3.0)
|
|
85
|
+
songkick_queue (1.0.0)
|
|
86
|
+
activesupport (>= 3.0.0)
|
|
87
|
+
bunny (~> 2.2)
|
|
88
|
+
thor (0.19.1)
|
|
89
|
+
thread_safe (0.3.5)
|
|
90
|
+
timers (4.1.1)
|
|
91
|
+
hitimes
|
|
92
|
+
typhoeus (0.8.0)
|
|
93
|
+
ethon (>= 0.8.0)
|
|
94
|
+
tzinfo (1.2.2)
|
|
95
|
+
thread_safe (~> 0.1)
|
|
96
|
+
webrobots (0.1.2)
|
|
97
|
+
|
|
98
|
+
PLATFORMS
|
|
99
|
+
ruby
|
|
100
|
+
|
|
101
|
+
DEPENDENCIES
|
|
102
|
+
daimon_skycrawlers!
|
|
103
|
+
rake
|
|
104
|
+
|
|
105
|
+
BUNDLED WITH
|
|
106
|
+
1.12.5
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# itp-crawler
|
|
2
|
+
|
|
3
|
+
Simple crawler for [iタウンページ](http://itp.ne.jp)
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Ruby
|
|
8
|
+
- RabbitMQ
|
|
9
|
+
- RDB
|
|
10
|
+
- PostgreSQL (default)
|
|
11
|
+
- MySQL
|
|
12
|
+
- SQLite3
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
1. Install dependencies
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
$ bundle install
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
2. Create database
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
$ bundle exec rake db:create
|
|
26
|
+
$ bundle exec rake db:migrate
|
|
27
|
+
$ bundle exec rake itp:db:create
|
|
28
|
+
$ bundle exec rake itp:db:migrate
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
3. Open new terminal and run crawler/processor
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
$ bundle exec daimon_skycrawlers exec crawler # on new terminal
|
|
35
|
+
$ bundle exec daimon_skycrawlers exec processor # on new terminal
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
4. Enqueue task
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
$ bundle exec daimon_skycrawlers enqueue url "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
|
45
|
+
|
|
46
|
+
6. You can re-enqueue task for processor
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
$ bundle exec daimon_skycrawlers enqueue response "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
|
|
50
|
+
```
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
require "daimon_skycrawlers"
|
|
2
|
+
require "daimon_skycrawlers/processor"
|
|
3
|
+
require "daimon_skycrawlers/processor/base"
|
|
4
|
+
|
|
5
|
+
require_relative "../models/itp_shop"
|
|
6
|
+
|
|
7
|
+
class ItpProcessor < DaimonSkycrawlers::Processor::Base
|
|
8
|
+
def call(message)
|
|
9
|
+
key_url = message[:url]
|
|
10
|
+
page = storage.find(key_url)
|
|
11
|
+
@doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
|
|
12
|
+
ItpShop.transaction do
|
|
13
|
+
prepare_shops do |shop|
|
|
14
|
+
itp_shop = ItpShop.find_or_initialize_by(itp_url: shop.itp_url)
|
|
15
|
+
itp_shop.assign_attributes(shop.to_h)
|
|
16
|
+
itp_shop.save!
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
unless %r(/pg/) =~ key_url
|
|
20
|
+
enqueue_pages(key_url)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
Shop = Struct.new(:name, :description, :itp_url, :zip_code, :address, :phone)
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def prepare_shops
|
|
29
|
+
@doc.search(".normalResultsBox").each do |shop|
|
|
30
|
+
begin
|
|
31
|
+
name_element = shop.at("section h4 .blueText") || shop.at("section h4 .brackText")
|
|
32
|
+
name = name_element.content.strip
|
|
33
|
+
description = shop.at("section p").content.strip
|
|
34
|
+
# Avoid false detection for shop address
|
|
35
|
+
if description.start_with?("住所")
|
|
36
|
+
description = nil
|
|
37
|
+
end
|
|
38
|
+
itp_path = shop.at("section h4 a").attr("href")
|
|
39
|
+
phone = shop.at("section p b").content.strip
|
|
40
|
+
address_element = shop.search("section p").detect do |element|
|
|
41
|
+
/住所/ =~ element.content
|
|
42
|
+
end
|
|
43
|
+
address_element.search("span").unlink
|
|
44
|
+
address_element.search("a").unlink
|
|
45
|
+
address_text = address_element.content.strip
|
|
46
|
+
zip_code = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 1)
|
|
47
|
+
address = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 2).sub(/\A[[:space:]]+/, "")
|
|
48
|
+
s = Shop.new(name,
|
|
49
|
+
description,
|
|
50
|
+
retrieve_individual_page_url(itp_path),
|
|
51
|
+
zip_code,
|
|
52
|
+
address,
|
|
53
|
+
phone)
|
|
54
|
+
yield s
|
|
55
|
+
rescue => e
|
|
56
|
+
log.warn("#{e.class}: #{e.message}")
|
|
57
|
+
log.debug(e.backtrace.join("\n"))
|
|
58
|
+
break
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# NOTE: HEAD request to itp.ne.jp is so slow
|
|
64
|
+
#
|
|
65
|
+
# /shop/KN2700060500184274/?url=%2F0663026300%2F&s_bid=KN2700060500184274&s_sid=FSP-LSR-002&s_fr=V01&s_ck=C01
|
|
66
|
+
# http://nttbj.itp.ne.jp/0663026300/index.html
|
|
67
|
+
#
|
|
68
|
+
# or
|
|
69
|
+
#
|
|
70
|
+
# /shop/KN2700060500039708/
|
|
71
|
+
# http://itp.ne.jp/shop/KN2700060500039708/
|
|
72
|
+
def retrieve_individual_page_url(path)
|
|
73
|
+
shop_id = path.slice(/\/\?url=(.+)&/, 1)
|
|
74
|
+
uri = if shop_id
|
|
75
|
+
URI("http://nttbj.itp.ne.jp/") + URI.unescape(shop_id) + "index.html"
|
|
76
|
+
else
|
|
77
|
+
URI("http://itp.ne.jp/") + path
|
|
78
|
+
end
|
|
79
|
+
uri.to_s
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
MAX_PAGE_NUM = 100
|
|
83
|
+
|
|
84
|
+
def enqueue_pages(base_url)
|
|
85
|
+
search_result = @doc.at("h1.searchResultHeader").content.strip.slice(/(\d+)件/, 1).to_i
|
|
86
|
+
# itp.ne.jp can display 5000 search results.
|
|
87
|
+
2.upto([(search_result / 50), MAX_PAGE_NUM].min) do |n|
|
|
88
|
+
url = URI.join(base_url, "pg/#{n}/?num=50")
|
|
89
|
+
DaimonSkycrawlers::Crawler.enqueue_url(url.to_s)
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
processor = ItpProcessor.new
|
|
95
|
+
DaimonSkycrawlers.register_processor(processor)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# PostgreSQL. Versions 8.2 and up are supported.
|
|
2
|
+
#
|
|
3
|
+
default: &default
|
|
4
|
+
adapter: postgresql
|
|
5
|
+
encoding: unicode
|
|
6
|
+
pool: 5
|
|
7
|
+
|
|
8
|
+
development:
|
|
9
|
+
<<: *default
|
|
10
|
+
database: itp-crawler_development
|
|
11
|
+
#username: itp-crawler
|
|
12
|
+
#password:
|
|
13
|
+
#host: localhost
|
|
14
|
+
#port: 5432
|
|
15
|
+
#schema_search_path: myapp,sharedapp,public
|
|
16
|
+
#min_messages: notice
|
|
17
|
+
|
|
18
|
+
test:
|
|
19
|
+
<<: *default
|
|
20
|
+
database: itp-crawler_test
|
|
21
|
+
|
|
22
|
+
production:
|
|
23
|
+
<<: *default
|
|
24
|
+
database: itp-crawler_production
|
|
25
|
+
username: itp-crawler
|
|
26
|
+
password: <%= ENV['ITP-CRAWLER_PASSWORD'] %>
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# PostgreSQL. Versions 8.2 and up are supported.
|
|
2
|
+
#
|
|
3
|
+
default: &default
|
|
4
|
+
adapter: postgresql
|
|
5
|
+
encoding: unicode
|
|
6
|
+
pool: 5
|
|
7
|
+
|
|
8
|
+
development:
|
|
9
|
+
<<: *default
|
|
10
|
+
database: itp-processor_development
|
|
11
|
+
#username: itp-processor
|
|
12
|
+
#password:
|
|
13
|
+
#host: localhost
|
|
14
|
+
#port: 5432
|
|
15
|
+
#schema_search_path: myapp,sharedapp,public
|
|
16
|
+
#min_messages: notice
|
|
17
|
+
|
|
18
|
+
test:
|
|
19
|
+
<<: *default
|
|
20
|
+
database: itp-processor_test
|
|
21
|
+
|
|
22
|
+
production:
|
|
23
|
+
<<: *default
|
|
24
|
+
database: itp-processor_production
|
|
25
|
+
username: itp-processor
|
|
26
|
+
password: <%= ENV['ITP-PROCESSOR_PASSWORD'] %>
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require "bundler/setup"
|
|
2
|
+
require "daimon_skycrawlers"
|
|
3
|
+
require "daimon_skycrawlers/logger"
|
|
4
|
+
require "daimon_skycrawlers/queue"
|
|
5
|
+
|
|
6
|
+
DaimonSkycrawlers.configure do |config|
|
|
7
|
+
config.logger = DaimonSkycrawlers::Logger.default
|
|
8
|
+
config.crawler_interval = 1
|
|
9
|
+
config.shutdown_interval = 30
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
DaimonSkycrawlers::Queue.configure do |config|
|
|
13
|
+
# queue configuration
|
|
14
|
+
config.logger = DaimonSkycrawlers.configuration.logger
|
|
15
|
+
config.host = "127.0.0.1"
|
|
16
|
+
config.port = 5672
|
|
17
|
+
# config.username = 'guest'
|
|
18
|
+
# config.password = 'guest'
|
|
19
|
+
config.vhost = "/"
|
|
20
|
+
config.max_reconnect_attempts = 10
|
|
21
|
+
config.network_recovery_interval = 1.0
|
|
22
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# This file is auto-generated from the current state of the database. Instead
|
|
2
|
+
# of editing this file, please use the migrations feature of Active Record to
|
|
3
|
+
# incrementally modify your database, and then regenerate this schema definition.
|
|
4
|
+
#
|
|
5
|
+
# Note that this schema.rb definition is the authoritative source for your
|
|
6
|
+
# database schema. If you need to create the application database on another
|
|
7
|
+
# system, you should be using db:schema:load, not running all the migrations
|
|
8
|
+
# from scratch. The latter is a flawed and unsustainable approach (the more migrations
|
|
9
|
+
# you'll amass, the slower it'll run and the greater likelihood for issues).
|
|
10
|
+
#
|
|
11
|
+
# It's strongly recommended that you check this file into your version control system.
|
|
12
|
+
|
|
13
|
+
ActiveRecord::Schema.define(version: 20161018044144) do
|
|
14
|
+
|
|
15
|
+
# These are extensions that must be enabled in order to support this database
|
|
16
|
+
enable_extension "plpgsql"
|
|
17
|
+
|
|
18
|
+
create_table "pages", force: :cascade do |t|
|
|
19
|
+
t.string "url"
|
|
20
|
+
t.text "headers"
|
|
21
|
+
t.binary "body"
|
|
22
|
+
t.datetime "last_modified_at"
|
|
23
|
+
t.string "etag"
|
|
24
|
+
t.datetime "created_at", null: false
|
|
25
|
+
t.datetime "updated_at", null: false
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class CreateShop < ActiveRecord::Migration[5.0]
|
|
2
|
+
def change
|
|
3
|
+
create_table :shops do |t|
|
|
4
|
+
t.string :name, index: true
|
|
5
|
+
t.text :description
|
|
6
|
+
t.string :itp_url, index: true
|
|
7
|
+
t.string :zip_code
|
|
8
|
+
t.string :address
|
|
9
|
+
t.string :phone
|
|
10
|
+
|
|
11
|
+
t.timestamps
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# This file is auto-generated from the current state of the database. Instead
|
|
2
|
+
# of editing this file, please use the migrations feature of Active Record to
|
|
3
|
+
# incrementally modify your database, and then regenerate this schema definition.
|
|
4
|
+
#
|
|
5
|
+
# Note that this schema.rb definition is the authoritative source for your
|
|
6
|
+
# database schema. If you need to create the application database on another
|
|
7
|
+
# system, you should be using db:schema:load, not running all the migrations
|
|
8
|
+
# from scratch. The latter is a flawed and unsustainable approach (the more migrations
|
|
9
|
+
# you'll amass, the slower it'll run and the greater likelihood for issues).
|
|
10
|
+
#
|
|
11
|
+
# It's strongly recommended that you check this file into your version control system.
|
|
12
|
+
|
|
13
|
+
ActiveRecord::Schema.define(version: 20161020044144) do
|
|
14
|
+
|
|
15
|
+
# These are extensions that must be enabled in order to support this database
|
|
16
|
+
enable_extension "plpgsql"
|
|
17
|
+
|
|
18
|
+
create_table "shops", force: :cascade do |t|
|
|
19
|
+
t.string "name"
|
|
20
|
+
t.text "description"
|
|
21
|
+
t.string "itp_url"
|
|
22
|
+
t.string "zip_code"
|
|
23
|
+
t.string "address"
|
|
24
|
+
t.string "phone"
|
|
25
|
+
t.datetime "created_at", null: false
|
|
26
|
+
t.datetime "updated_at", null: false
|
|
27
|
+
t.index ["itp_url"], name: "index_shops_on_itp_url", using: :btree
|
|
28
|
+
t.index ["name"], name: "index_shops_on_name", using: :btree
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
require "daimon_skycrawlers"
|
|
2
|
+
|
|
3
|
+
seed_loader = Class.new do
|
|
4
|
+
def load_seed
|
|
5
|
+
# load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
|
|
6
|
+
end
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
namespace :itp do
|
|
10
|
+
namespace :db do |ns|
|
|
11
|
+
task :drop => [:load_config] do
|
|
12
|
+
ActiveRecord::Tasks::DatabaseTasks.drop_current
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
task :create => [:load_config] do
|
|
16
|
+
ActiveRecord::Tasks::DatabaseTasks.create_current
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
task :migrate => [:load_config] do
|
|
20
|
+
ActiveRecord::Tasks::DatabaseTasks.migrate
|
|
21
|
+
ns["_dump"].invoke
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
task :_dump do
|
|
25
|
+
ns["schema:dump"].invoke
|
|
26
|
+
ns["_dump"].reenable
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
task :version => [:load_config] do
|
|
30
|
+
puts "Current version: #{ActiveRecord::Migrator.current_version}"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
namespace :migrate do
|
|
34
|
+
task :reset => ["db:drop", "db:create", "db:migrate"]
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
namespace :schema do
|
|
38
|
+
task :dump => [:load_config] do
|
|
39
|
+
require "active_record/schema_dumper"
|
|
40
|
+
filename = ENV["SCHEMA"] || File.join(ActiveRecord::Tasks::DatabaseTasks.db_dir, "schema.rb")
|
|
41
|
+
File.open(filename, "w:utf-8") do |file|
|
|
42
|
+
ActiveRecord::SchemaDumper.dump(ActiveRecord::Base.connection, file)
|
|
43
|
+
end
|
|
44
|
+
ns["schema:dump"].reenable
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
task :load => [:load_config] do
|
|
48
|
+
ActiveRecord::Tasks::DatabaseTasks.load_schema_current(:ruby, ENV["SCHEMA"])
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
task :load_config do
|
|
53
|
+
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
|
54
|
+
config.root = Rake.application.original_dir
|
|
55
|
+
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
|
56
|
+
config.db_dir = "db_itp"
|
|
57
|
+
config.migrations_paths = ["db_itp/migrate"]
|
|
58
|
+
config.fixtures_path = "test/fixtures"
|
|
59
|
+
config.seed_loader = seed_loader.new
|
|
60
|
+
config.database_configuration = YAML.load_file("config/database_itp.yml")
|
|
61
|
+
end
|
|
62
|
+
ActiveRecord::Base.configurations = YAML.load_file("config/database_itp.yml")
|
|
63
|
+
ActiveRecord::Base.establish_connection(DaimonSkycrawlers.env.to_sym)
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require_relative "./tasks/database_tasks"
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: daimon_skycrawlers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ryunosuke SATO
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-10-
|
|
11
|
+
date: 2016-10-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -151,7 +151,7 @@ dependencies:
|
|
|
151
151
|
- !ruby/object:Gem::Version
|
|
152
152
|
version: '0'
|
|
153
153
|
- !ruby/object:Gem::Dependency
|
|
154
|
-
name:
|
|
154
|
+
name: typhoeus
|
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
|
156
156
|
requirements:
|
|
157
157
|
- - ">="
|
|
@@ -343,6 +343,7 @@ files:
|
|
|
343
343
|
- lib/daimon_skycrawlers/processor/default.rb
|
|
344
344
|
- lib/daimon_skycrawlers/processor/spider.rb
|
|
345
345
|
- lib/daimon_skycrawlers/queue.rb
|
|
346
|
+
- lib/daimon_skycrawlers/sitemap_parser.rb
|
|
346
347
|
- lib/daimon_skycrawlers/storage.rb
|
|
347
348
|
- lib/daimon_skycrawlers/storage/base.rb
|
|
348
349
|
- lib/daimon_skycrawlers/storage/file.rb
|
|
@@ -352,6 +353,23 @@ files:
|
|
|
352
353
|
- lib/daimon_skycrawlers/tasks/database_tasks.rake
|
|
353
354
|
- lib/daimon_skycrawlers/timer.rb
|
|
354
355
|
- lib/daimon_skycrawlers/version.rb
|
|
356
|
+
- sample/itp-crawler/Gemfile
|
|
357
|
+
- sample/itp-crawler/Gemfile.lock
|
|
358
|
+
- sample/itp-crawler/README.md
|
|
359
|
+
- sample/itp-crawler/Rakefile
|
|
360
|
+
- sample/itp-crawler/app/crawlers/itp_crawler.rb
|
|
361
|
+
- sample/itp-crawler/app/models/itp_base.rb
|
|
362
|
+
- sample/itp-crawler/app/models/itp_shop.rb
|
|
363
|
+
- sample/itp-crawler/app/processors/itp_processor.rb
|
|
364
|
+
- sample/itp-crawler/config/database.yml
|
|
365
|
+
- sample/itp-crawler/config/database_itp.yml
|
|
366
|
+
- sample/itp-crawler/config/init.rb
|
|
367
|
+
- sample/itp-crawler/db/migrate/20161018044144_create_page.rb
|
|
368
|
+
- sample/itp-crawler/db/schema.rb
|
|
369
|
+
- sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb
|
|
370
|
+
- sample/itp-crawler/db_itp/schema.rb
|
|
371
|
+
- sample/itp-crawler/lib/tasks.rb
|
|
372
|
+
- sample/itp-crawler/lib/tasks/database_tasks.rb
|
|
355
373
|
- sample/spider/Gemfile
|
|
356
374
|
- sample/spider/README.md
|
|
357
375
|
- sample/spider/Rakefile
|