daimon_skycrawlers 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/daimon_skycrawlers.gemspec +1 -1
- data/lib/daimon_skycrawlers/commands/enqueue.rb +7 -4
- data/lib/daimon_skycrawlers/crawler/base.rb +2 -0
- data/lib/daimon_skycrawlers/crawler/default.rb +0 -2
- data/lib/daimon_skycrawlers/crawler.rb +1 -0
- data/lib/daimon_skycrawlers/processor.rb +27 -0
- data/lib/daimon_skycrawlers/sitemap_parser.rb +79 -0
- data/lib/daimon_skycrawlers/storage/rdb.rb +7 -4
- data/lib/daimon_skycrawlers/tasks/database_tasks.rake +10 -11
- data/lib/daimon_skycrawlers/version.rb +1 -1
- data/lib/daimon_skycrawlers.rb +7 -0
- data/sample/itp-crawler/Gemfile +4 -0
- data/sample/itp-crawler/Gemfile.lock +106 -0
- data/sample/itp-crawler/README.md +50 -0
- data/sample/itp-crawler/Rakefile +2 -0
- data/sample/itp-crawler/app/crawlers/itp_crawler.rb +8 -0
- data/sample/itp-crawler/app/models/itp_base.rb +5 -0
- data/sample/itp-crawler/app/models/itp_shop.rb +5 -0
- data/sample/itp-crawler/app/processors/itp_processor.rb +95 -0
- data/sample/itp-crawler/config/database.yml +26 -0
- data/sample/itp-crawler/config/database_itp.yml +26 -0
- data/sample/itp-crawler/config/init.rb +22 -0
- data/sample/itp-crawler/db/migrate/20161018044144_create_page.rb +13 -0
- data/sample/itp-crawler/db/schema.rb +28 -0
- data/sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb +14 -0
- data/sample/itp-crawler/db_itp/schema.rb +31 -0
- data/sample/itp-crawler/lib/tasks/database_tasks.rb +66 -0
- data/sample/itp-crawler/lib/tasks.rb +1 -0
- metadata +21 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e81cffca8abac23dba0da166eb2a1f6cef80b2c1
|
4
|
+
data.tar.gz: d4948c33c37ee8d9508d93c037919a784a2d21cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4840d12acec8c75a13330810029a32c62827e35a5ca871b06dec0b1f80e7be9b8dd9fa08208a4b0daac4cd527a32ba1bdc1c893c815834ebbfe22750fd440997
|
7
|
+
data.tar.gz: 072f215aa3f245445c9ca335651fe875c238039978f3cf0eac6bc873b0d34b1c1dd4626cfa2320ce4ac5e9b1defe05ffba8ec63dcf9e5c8197472c2fa5cd9cea
|
data/daimon_skycrawlers.gemspec
CHANGED
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
|
|
28
28
|
spec.add_dependency "railties"
|
29
29
|
spec.add_dependency "pg"
|
30
30
|
spec.add_dependency "timers"
|
31
|
-
spec.add_dependency "
|
31
|
+
spec.add_dependency "typhoeus"
|
32
32
|
spec.add_dependency "webrobots"
|
33
33
|
|
34
34
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -1,8 +1,8 @@
|
|
1
1
|
require "daimon_skycrawlers"
|
2
2
|
require "daimon_skycrawlers/crawler"
|
3
3
|
require "daimon_skycrawlers/processor"
|
4
|
+
require "daimon_skycrawlers/sitemap_parser"
|
4
5
|
require "daimon_skycrawlers/version"
|
5
|
-
require "sitemap-parser"
|
6
6
|
require "webrobots"
|
7
7
|
|
8
8
|
module DaimonSkycrawlers
|
@@ -27,6 +27,7 @@ module DaimonSkycrawlers
|
|
27
27
|
desc "sitemap [OPTIONS] URL", "Enqueue URLs from simtemap.xml"
|
28
28
|
method_option("robots-txt", aliases: ["-r"], type: :boolean,
|
29
29
|
desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
|
30
|
+
method_option("dump", type: :boolean, desc: "Dump URLs without enqueue")
|
30
31
|
def sitemap(url)
|
31
32
|
load_init
|
32
33
|
if options["robots-txt"]
|
@@ -35,9 +36,11 @@ module DaimonSkycrawlers
|
|
35
36
|
else
|
36
37
|
sitemaps = [url]
|
37
38
|
end
|
38
|
-
|
39
|
-
|
40
|
-
|
39
|
+
sitemap_parser = DaimonSkycrawlers::SitemapParser.new(sitemaps)
|
40
|
+
urls = sitemap_parser.parse
|
41
|
+
if options["dump"]
|
42
|
+
puts urls.join("\n")
|
43
|
+
return
|
41
44
|
end
|
42
45
|
urls.each do |_url|
|
43
46
|
DaimonSkycrawlers::Crawler.enqueue_url(_url)
|
@@ -5,6 +5,8 @@ require "daimon_skycrawlers/logger"
|
|
5
5
|
require "daimon_skycrawlers/config"
|
6
6
|
require "daimon_skycrawlers/storage"
|
7
7
|
require "daimon_skycrawlers/processor"
|
8
|
+
require "daimon_skycrawlers/filter/update_checker"
|
9
|
+
require "daimon_skycrawlers/filter/robots_txt_checker"
|
8
10
|
|
9
11
|
module DaimonSkycrawlers
|
10
12
|
module Crawler
|
@@ -6,24 +6,51 @@ require "daimon_skycrawlers/consumer/http_response"
|
|
6
6
|
module DaimonSkycrawlers
|
7
7
|
module Processor
|
8
8
|
class << self
|
9
|
+
#
|
10
|
+
# Run registered processors
|
11
|
+
#
|
12
|
+
# @param process_name [String] Process name
|
13
|
+
#
|
9
14
|
def run(process_name: default_process_name)
|
10
15
|
DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
|
11
16
|
SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
|
12
17
|
end
|
13
18
|
|
19
|
+
#
|
20
|
+
# Enqueue a URL to processor queue
|
21
|
+
#
|
22
|
+
# @param url [String] Specify absolute URL
|
23
|
+
# @param message [Hash] Extra parameters for crawler
|
24
|
+
# @return [void]
|
14
25
|
def enqueue_http_response(url, message = {})
|
15
26
|
message[:url] = url
|
27
|
+
config.logger.debug("#{queue_name}: #{url}")
|
16
28
|
SongkickQueue.publish(queue_name, message)
|
17
29
|
end
|
18
30
|
|
31
|
+
#
|
32
|
+
# Shortcut of DaimonSkycrawlers.configuration
|
33
|
+
#
|
34
|
+
# @return [DaimonSkycrawlers::Configuration]
|
35
|
+
#
|
19
36
|
def config
|
20
37
|
DaimonSkycrawlers.configuration
|
21
38
|
end
|
22
39
|
|
40
|
+
#
|
41
|
+
# Queue name for processor
|
42
|
+
#
|
43
|
+
# @return [String] Queue name
|
44
|
+
#
|
23
45
|
def queue_name
|
24
46
|
"#{config.queue_name_prefix}.http-response"
|
25
47
|
end
|
26
48
|
|
49
|
+
#
|
50
|
+
# Default process name
|
51
|
+
#
|
52
|
+
# @return [String] Default process name
|
53
|
+
#
|
27
54
|
def default_process_name
|
28
55
|
"#{config.queue_name_prefix}:http-response"
|
29
56
|
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "typhoeus"
|
3
|
+
require "zlib"
|
4
|
+
require "uri"
|
5
|
+
|
6
|
+
module DaimonSkycrawlers
|
7
|
+
# Based on https://github.com/benbalter/sitemap-parser
|
8
|
+
class SitemapParser
|
9
|
+
def initialize(urls, options = {})
|
10
|
+
@urls = urls
|
11
|
+
end
|
12
|
+
|
13
|
+
def parse
|
14
|
+
hydra = Typhoeus::Hydra.new(max_concurrency: 1)
|
15
|
+
sitemap_urls = []
|
16
|
+
@urls.each do |url|
|
17
|
+
if URI(url).scheme.start_with?("http")
|
18
|
+
request = Typhoeus::Request.new(url, followlocation: true)
|
19
|
+
request.on_complete do |response|
|
20
|
+
sitemap_urls.concat(on_complete(response))
|
21
|
+
end
|
22
|
+
hydra.queue(request)
|
23
|
+
else
|
24
|
+
if File.exist?(url)
|
25
|
+
extract_urls(File.read(url))
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
hydra.run
|
30
|
+
sitemap_urls
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def on_complete(response)
|
36
|
+
raise "HTTP requset to #{response.effective_url} failed" unless response.success?
|
37
|
+
raw_sitemap = inflate_response(response)
|
38
|
+
extract_urls(raw_sitemap)
|
39
|
+
end
|
40
|
+
|
41
|
+
def extract_urls(body)
|
42
|
+
sitemap = Nokogiri::XML(body)
|
43
|
+
case
|
44
|
+
when sitemap.at("sitemapindex")
|
45
|
+
urls = sitemap.search("sitemap").flat_map do |s|
|
46
|
+
s.at("loc").content
|
47
|
+
end
|
48
|
+
SitemapParser.new(urls).parse
|
49
|
+
when sitemap.at("urlset")
|
50
|
+
sitemap.search("url").flat_map do |url|
|
51
|
+
url.at("loc").content
|
52
|
+
end
|
53
|
+
else
|
54
|
+
raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def inflate_response(response)
|
59
|
+
if compressed?(response)
|
60
|
+
# We cannot inflate compressed data from NTFS filesystem (NT).
|
61
|
+
# Adding 32 to MAX_WBITS enables automatic zlib/gzip header detection, which can avoid errors
|
62
|
+
stream = Zlib::Inflate.new(Zlib::MAX_WBITS + 32)
|
63
|
+
stream.inflate(response.body)
|
64
|
+
else
|
65
|
+
response.body
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
def compressed?(response)
|
70
|
+
case response.headers["Content-Encoding"]&.downcase
|
71
|
+
when "deflate", "gzip", "x-gzip"
|
72
|
+
true
|
73
|
+
else
|
74
|
+
signature = response.body[0, 2]
|
75
|
+
signature == "\x1F\x8B".b
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
@@ -9,9 +9,8 @@ module DaimonSkycrawlers
|
|
9
9
|
class RDB < Base
|
10
10
|
def initialize(config_path = "config/database.yml")
|
11
11
|
super()
|
12
|
-
|
13
|
-
|
14
|
-
ActiveRecord::Base.establish_connection(config[environment])
|
12
|
+
Base.configurations = YAML.load_file(config_path)
|
13
|
+
Base.establish_connection(DaimonSkycrawlers.env.to_sym)
|
15
14
|
end
|
16
15
|
|
17
16
|
#
|
@@ -38,7 +37,11 @@ module DaimonSkycrawlers
|
|
38
37
|
Page.where(url: url).order(last_modified_at: :desc).limit(1).first
|
39
38
|
end
|
40
39
|
|
41
|
-
class
|
40
|
+
class Base < ActiveRecord::Base
|
41
|
+
self.abstract_class = true
|
42
|
+
end
|
43
|
+
|
44
|
+
class Page < Base
|
42
45
|
self.table_name = "pages"
|
43
46
|
end
|
44
47
|
end
|
@@ -30,24 +30,23 @@ seed_loader = Class.new do
|
|
30
30
|
end
|
31
31
|
end
|
32
32
|
|
33
|
-
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
34
|
-
config.root = Rake.application.original_dir
|
35
|
-
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
36
|
-
config.db_dir = "db"
|
37
|
-
config.migrations_paths = ["db/migrate"]
|
38
|
-
config.fixtures_path = "test/fixtures"
|
39
|
-
config.seed_loader = seed_loader.new
|
40
|
-
config.database_configuration = ActiveRecord::Base.configurations
|
41
|
-
end
|
42
|
-
|
43
33
|
# db:load_config can be overridden manually
|
44
34
|
Rake::Task["db:seed"].enhance(["db:load_config"])
|
45
35
|
Rake::Task["db:load_config"].clear
|
46
36
|
|
47
37
|
Rake::Task.define_task("db:environment")
|
48
38
|
Rake::Task.define_task("db:load_config") do
|
49
|
-
ActiveRecord::
|
39
|
+
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
40
|
+
config.root = Rake.application.original_dir
|
41
|
+
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
42
|
+
config.db_dir = "db"
|
43
|
+
config.migrations_paths = ["db/migrate"]
|
44
|
+
config.fixtures_path = "test/fixtures"
|
45
|
+
config.seed_loader = seed_loader.new
|
46
|
+
config.database_configuration = YAML.load_file("config/database.yml")
|
47
|
+
end
|
50
48
|
environment = ENV["SKYCRAWLERS_ENV"] || "development"
|
49
|
+
ActiveRecord::Base.configurations = ActiveRecord::Tasks::DatabaseTasks.database_configuration
|
51
50
|
ActiveRecord::Base.establish_connection(environment.to_sym)
|
52
51
|
end
|
53
52
|
Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
|
data/lib/daimon_skycrawlers.rb
CHANGED
@@ -0,0 +1,106 @@
|
|
1
|
+
PATH
|
2
|
+
remote: ../../
|
3
|
+
specs:
|
4
|
+
daimon_skycrawlers (0.6.0)
|
5
|
+
activerecord
|
6
|
+
bundler (~> 1.11)
|
7
|
+
faraday
|
8
|
+
faraday_middleware
|
9
|
+
nokogiri
|
10
|
+
pg
|
11
|
+
railties
|
12
|
+
songkick_queue
|
13
|
+
thor
|
14
|
+
timers
|
15
|
+
typhoeus
|
16
|
+
webrobots
|
17
|
+
|
18
|
+
GEM
|
19
|
+
remote: https://rubygems.org/
|
20
|
+
specs:
|
21
|
+
actionpack (5.0.0.1)
|
22
|
+
actionview (= 5.0.0.1)
|
23
|
+
activesupport (= 5.0.0.1)
|
24
|
+
rack (~> 2.0)
|
25
|
+
rack-test (~> 0.6.3)
|
26
|
+
rails-dom-testing (~> 2.0)
|
27
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
|
28
|
+
actionview (5.0.0.1)
|
29
|
+
activesupport (= 5.0.0.1)
|
30
|
+
builder (~> 3.1)
|
31
|
+
erubis (~> 2.7.0)
|
32
|
+
rails-dom-testing (~> 2.0)
|
33
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.2)
|
34
|
+
activemodel (5.0.0.1)
|
35
|
+
activesupport (= 5.0.0.1)
|
36
|
+
activerecord (5.0.0.1)
|
37
|
+
activemodel (= 5.0.0.1)
|
38
|
+
activesupport (= 5.0.0.1)
|
39
|
+
arel (~> 7.0)
|
40
|
+
activesupport (5.0.0.1)
|
41
|
+
concurrent-ruby (~> 1.0, >= 1.0.2)
|
42
|
+
i18n (~> 0.7)
|
43
|
+
minitest (~> 5.1)
|
44
|
+
tzinfo (~> 1.1)
|
45
|
+
amq-protocol (2.0.1)
|
46
|
+
arel (7.1.4)
|
47
|
+
builder (3.2.2)
|
48
|
+
bunny (2.6.0)
|
49
|
+
amq-protocol (>= 2.0.1)
|
50
|
+
concurrent-ruby (1.0.2)
|
51
|
+
erubis (2.7.0)
|
52
|
+
ethon (0.9.1)
|
53
|
+
ffi (>= 1.3.0)
|
54
|
+
faraday (0.9.2)
|
55
|
+
multipart-post (>= 1.2, < 3)
|
56
|
+
faraday_middleware (0.10.0)
|
57
|
+
faraday (>= 0.7.4, < 0.10)
|
58
|
+
ffi (1.9.14)
|
59
|
+
hitimes (1.2.4)
|
60
|
+
i18n (0.7.0)
|
61
|
+
loofah (2.0.3)
|
62
|
+
nokogiri (>= 1.5.9)
|
63
|
+
method_source (0.8.2)
|
64
|
+
mini_portile2 (2.1.0)
|
65
|
+
minitest (5.9.1)
|
66
|
+
multipart-post (2.0.0)
|
67
|
+
nokogiri (1.6.8.1)
|
68
|
+
mini_portile2 (~> 2.1.0)
|
69
|
+
pg (0.19.0)
|
70
|
+
rack (2.0.1)
|
71
|
+
rack-test (0.6.3)
|
72
|
+
rack (>= 1.0)
|
73
|
+
rails-dom-testing (2.0.1)
|
74
|
+
activesupport (>= 4.2.0, < 6.0)
|
75
|
+
nokogiri (~> 1.6.0)
|
76
|
+
rails-html-sanitizer (1.0.3)
|
77
|
+
loofah (~> 2.0)
|
78
|
+
railties (5.0.0.1)
|
79
|
+
actionpack (= 5.0.0.1)
|
80
|
+
activesupport (= 5.0.0.1)
|
81
|
+
method_source
|
82
|
+
rake (>= 0.8.7)
|
83
|
+
thor (>= 0.18.1, < 2.0)
|
84
|
+
rake (11.3.0)
|
85
|
+
songkick_queue (1.0.0)
|
86
|
+
activesupport (>= 3.0.0)
|
87
|
+
bunny (~> 2.2)
|
88
|
+
thor (0.19.1)
|
89
|
+
thread_safe (0.3.5)
|
90
|
+
timers (4.1.1)
|
91
|
+
hitimes
|
92
|
+
typhoeus (0.8.0)
|
93
|
+
ethon (>= 0.8.0)
|
94
|
+
tzinfo (1.2.2)
|
95
|
+
thread_safe (~> 0.1)
|
96
|
+
webrobots (0.1.2)
|
97
|
+
|
98
|
+
PLATFORMS
|
99
|
+
ruby
|
100
|
+
|
101
|
+
DEPENDENCIES
|
102
|
+
daimon_skycrawlers!
|
103
|
+
rake
|
104
|
+
|
105
|
+
BUNDLED WITH
|
106
|
+
1.12.5
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# itp-crawler
|
2
|
+
|
3
|
+
Simple crawler for [iタウンページ](http://itp.ne.jp)
|
4
|
+
|
5
|
+
## Requirements
|
6
|
+
|
7
|
+
- Ruby
|
8
|
+
- RabbitMQ
|
9
|
+
- RDB
|
10
|
+
- PostgreSQL (default)
|
11
|
+
- MySQL
|
12
|
+
- SQLite3
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
1. Install dependencies
|
17
|
+
|
18
|
+
```
|
19
|
+
$ bundle install
|
20
|
+
```
|
21
|
+
|
22
|
+
2. Create database
|
23
|
+
|
24
|
+
```
|
25
|
+
$ bundle exec rake db:create
|
26
|
+
$ bundle exec rake db:migrate
|
27
|
+
$ bundle exec rake itp:db:create
|
28
|
+
$ bundle exec rake itp:db:migrate
|
29
|
+
```
|
30
|
+
|
31
|
+
3. Open new terminal and run crawler/processor
|
32
|
+
|
33
|
+
```
|
34
|
+
$ bundle exec daimon_skycrawlers exec crawler # on new terminal
|
35
|
+
$ bundle exec daimon_skycrawlers exec processor # on new terminal
|
36
|
+
```
|
37
|
+
|
38
|
+
4. Enqueue task
|
39
|
+
|
40
|
+
```
|
41
|
+
$ bundle exec daimon_skycrawlers enqueue url "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
|
42
|
+
```
|
43
|
+
|
44
|
+
5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
|
45
|
+
|
46
|
+
6. You can re-enqueue task for processor
|
47
|
+
|
48
|
+
```
|
49
|
+
$ bundle exec daimon_skycrawlers enqueue response "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
|
50
|
+
```
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require "daimon_skycrawlers"
|
2
|
+
require "daimon_skycrawlers/processor"
|
3
|
+
require "daimon_skycrawlers/processor/base"
|
4
|
+
|
5
|
+
require_relative "../models/itp_shop"
|
6
|
+
|
7
|
+
class ItpProcessor < DaimonSkycrawlers::Processor::Base
|
8
|
+
def call(message)
|
9
|
+
key_url = message[:url]
|
10
|
+
page = storage.find(key_url)
|
11
|
+
@doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
|
12
|
+
ItpShop.transaction do
|
13
|
+
prepare_shops do |shop|
|
14
|
+
itp_shop = ItpShop.find_or_initialize_by(itp_url: shop.itp_url)
|
15
|
+
itp_shop.assign_attributes(shop.to_h)
|
16
|
+
itp_shop.save!
|
17
|
+
end
|
18
|
+
end
|
19
|
+
unless %r(/pg/) =~ key_url
|
20
|
+
enqueue_pages(key_url)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
Shop = Struct.new(:name, :description, :itp_url, :zip_code, :address, :phone)
|
25
|
+
|
26
|
+
private
|
27
|
+
|
28
|
+
def prepare_shops
|
29
|
+
@doc.search(".normalResultsBox").each do |shop|
|
30
|
+
begin
|
31
|
+
name_element = shop.at("section h4 .blueText") || shop.at("section h4 .brackText")
|
32
|
+
name = name_element.content.strip
|
33
|
+
description = shop.at("section p").content.strip
|
34
|
+
# Avoid false detection for shop address
|
35
|
+
if description.start_with?("住所")
|
36
|
+
description = nil
|
37
|
+
end
|
38
|
+
itp_path = shop.at("section h4 a").attr("href")
|
39
|
+
phone = shop.at("section p b").content.strip
|
40
|
+
address_element = shop.search("section p").detect do |element|
|
41
|
+
/住所/ =~ element.content
|
42
|
+
end
|
43
|
+
address_element.search("span").unlink
|
44
|
+
address_element.search("a").unlink
|
45
|
+
address_text = address_element.content.strip
|
46
|
+
zip_code = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 1)
|
47
|
+
address = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 2).sub(/\A[[:space:]]+/, "")
|
48
|
+
s = Shop.new(name,
|
49
|
+
description,
|
50
|
+
retrieve_individual_page_url(itp_path),
|
51
|
+
zip_code,
|
52
|
+
address,
|
53
|
+
phone)
|
54
|
+
yield s
|
55
|
+
rescue => e
|
56
|
+
log.warn("#{e.class}: #{e.message}")
|
57
|
+
log.debug(e.backtrace.join("\n"))
|
58
|
+
break
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# NOTE: HEAD requests to itp.ne.jp are very slow, so the URL is derived from the path instead
|
64
|
+
#
|
65
|
+
# /shop/KN2700060500184274/?url=%2F0663026300%2F&s_bid=KN2700060500184274&s_sid=FSP-LSR-002&s_fr=V01&s_ck=C01
|
66
|
+
# http://nttbj.itp.ne.jp/0663026300/index.html
|
67
|
+
#
|
68
|
+
# or
|
69
|
+
#
|
70
|
+
# /shop/KN2700060500039708/
|
71
|
+
# http://itp.ne.jp/shop/KN2700060500039708/
|
72
|
+
def retrieve_individual_page_url(path)
|
73
|
+
shop_id = path.slice(/\/\?url=(.+)&/, 1)
|
74
|
+
uri = if shop_id
|
75
|
+
URI("http://nttbj.itp.ne.jp/") + URI.unescape(shop_id) + "index.html"
|
76
|
+
else
|
77
|
+
URI("http://itp.ne.jp/") + path
|
78
|
+
end
|
79
|
+
uri.to_s
|
80
|
+
end
|
81
|
+
|
82
|
+
MAX_PAGE_NUM = 100
|
83
|
+
|
84
|
+
def enqueue_pages(base_url)
|
85
|
+
search_result = @doc.at("h1.searchResultHeader").content.strip.slice(/(\d+)件/, 1).to_i
|
86
|
+
# itp.ne.jp can display at most 5000 search results (MAX_PAGE_NUM pages of 50).
|
87
|
+
2.upto([(search_result / 50), MAX_PAGE_NUM].min) do |n|
|
88
|
+
url = URI.join(base_url, "pg/#{n}/?num=50")
|
89
|
+
DaimonSkycrawlers::Crawler.enqueue_url(url.to_s)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
processor = ItpProcessor.new
|
95
|
+
DaimonSkycrawlers.register_processor(processor)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# PostgreSQL. Versions 8.2 and up are supported.
|
2
|
+
#
|
3
|
+
default: &default
|
4
|
+
adapter: postgresql
|
5
|
+
encoding: unicode
|
6
|
+
pool: 5
|
7
|
+
|
8
|
+
development:
|
9
|
+
<<: *default
|
10
|
+
database: itp-crawler_development
|
11
|
+
#username: itp-crawler
|
12
|
+
#password:
|
13
|
+
#host: localhost
|
14
|
+
#port: 5432
|
15
|
+
#schema_search_path: myapp,sharedapp,public
|
16
|
+
#min_messages: notice
|
17
|
+
|
18
|
+
test:
|
19
|
+
<<: *default
|
20
|
+
database: itp-crawler_test
|
21
|
+
|
22
|
+
production:
|
23
|
+
<<: *default
|
24
|
+
database: itp-crawler_production
|
25
|
+
username: itp-crawler
|
26
|
+
password: <%= ENV['ITP-CRAWLER_PASSWORD'] %>
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# PostgreSQL. Versions 8.2 and up are supported.
|
2
|
+
#
|
3
|
+
default: &default
|
4
|
+
adapter: postgresql
|
5
|
+
encoding: unicode
|
6
|
+
pool: 5
|
7
|
+
|
8
|
+
development:
|
9
|
+
<<: *default
|
10
|
+
database: itp-processor_development
|
11
|
+
#username: itp-processor
|
12
|
+
#password:
|
13
|
+
#host: localhost
|
14
|
+
#port: 5432
|
15
|
+
#schema_search_path: myapp,sharedapp,public
|
16
|
+
#min_messages: notice
|
17
|
+
|
18
|
+
test:
|
19
|
+
<<: *default
|
20
|
+
database: itp-processor_test
|
21
|
+
|
22
|
+
production:
|
23
|
+
<<: *default
|
24
|
+
database: itp-processor_production
|
25
|
+
username: itp-processor
|
26
|
+
password: <%= ENV['ITP-PROCESSOR_PASSWORD'] %>
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "daimon_skycrawlers"
|
3
|
+
require "daimon_skycrawlers/logger"
|
4
|
+
require "daimon_skycrawlers/queue"
|
5
|
+
|
6
|
+
DaimonSkycrawlers.configure do |config|
|
7
|
+
config.logger = DaimonSkycrawlers::Logger.default
|
8
|
+
config.crawler_interval = 1
|
9
|
+
config.shutdown_interval = 30
|
10
|
+
end
|
11
|
+
|
12
|
+
DaimonSkycrawlers::Queue.configure do |config|
|
13
|
+
# queue configuration
|
14
|
+
config.logger = DaimonSkycrawlers.configuration.logger
|
15
|
+
config.host = "127.0.0.1"
|
16
|
+
config.port = 5672
|
17
|
+
# config.username = 'guest'
|
18
|
+
# config.password = 'guest'
|
19
|
+
config.vhost = "/"
|
20
|
+
config.max_reconnect_attempts = 10
|
21
|
+
config.network_recovery_interval = 1.0
|
22
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# This file is auto-generated from the current state of the database. Instead
|
2
|
+
# of editing this file, please use the migrations feature of Active Record to
|
3
|
+
# incrementally modify your database, and then regenerate this schema definition.
|
4
|
+
#
|
5
|
+
# Note that this schema.rb definition is the authoritative source for your
|
6
|
+
# database schema. If you need to create the application database on another
|
7
|
+
# system, you should be using db:schema:load, not running all the migrations
|
8
|
+
# from scratch. The latter is a flawed and unsustainable approach (the more migrations
|
9
|
+
# you'll amass, the slower it'll run and the greater likelihood for issues).
|
10
|
+
#
|
11
|
+
# It's strongly recommended that you check this file into your version control system.
|
12
|
+
|
13
|
+
ActiveRecord::Schema.define(version: 20161018044144) do
|
14
|
+
|
15
|
+
# These are extensions that must be enabled in order to support this database
|
16
|
+
enable_extension "plpgsql"
|
17
|
+
|
18
|
+
create_table "pages", force: :cascade do |t|
|
19
|
+
t.string "url"
|
20
|
+
t.text "headers"
|
21
|
+
t.binary "body"
|
22
|
+
t.datetime "last_modified_at"
|
23
|
+
t.string "etag"
|
24
|
+
t.datetime "created_at", null: false
|
25
|
+
t.datetime "updated_at", null: false
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
class CreateShop < ActiveRecord::Migration[5.0]
|
2
|
+
def change
|
3
|
+
create_table :shops do |t|
|
4
|
+
t.string :name, index: true
|
5
|
+
t.text :description
|
6
|
+
t.string :itp_url, index: true
|
7
|
+
t.string :zip_code
|
8
|
+
t.string :address
|
9
|
+
t.string :phone
|
10
|
+
|
11
|
+
t.timestamps
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# This file is auto-generated from the current state of the database. Instead
|
2
|
+
# of editing this file, please use the migrations feature of Active Record to
|
3
|
+
# incrementally modify your database, and then regenerate this schema definition.
|
4
|
+
#
|
5
|
+
# Note that this schema.rb definition is the authoritative source for your
|
6
|
+
# database schema. If you need to create the application database on another
|
7
|
+
# system, you should be using db:schema:load, not running all the migrations
|
8
|
+
# from scratch. The latter is a flawed and unsustainable approach (the more migrations
|
9
|
+
# you'll amass, the slower it'll run and the greater likelihood for issues).
|
10
|
+
#
|
11
|
+
# It's strongly recommended that you check this file into your version control system.
|
12
|
+
|
13
|
+
ActiveRecord::Schema.define(version: 20161020044144) do
|
14
|
+
|
15
|
+
# These are extensions that must be enabled in order to support this database
|
16
|
+
enable_extension "plpgsql"
|
17
|
+
|
18
|
+
create_table "shops", force: :cascade do |t|
|
19
|
+
t.string "name"
|
20
|
+
t.text "description"
|
21
|
+
t.string "itp_url"
|
22
|
+
t.string "zip_code"
|
23
|
+
t.string "address"
|
24
|
+
t.string "phone"
|
25
|
+
t.datetime "created_at", null: false
|
26
|
+
t.datetime "updated_at", null: false
|
27
|
+
t.index ["itp_url"], name: "index_shops_on_itp_url", using: :btree
|
28
|
+
t.index ["name"], name: "index_shops_on_name", using: :btree
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "daimon_skycrawlers"
|
2
|
+
|
3
|
+
seed_loader = Class.new do
|
4
|
+
def load_seed
|
5
|
+
# load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
namespace :itp do
|
10
|
+
namespace :db do |ns|
|
11
|
+
task :drop => [:load_config] do
|
12
|
+
ActiveRecord::Tasks::DatabaseTasks.drop_current
|
13
|
+
end
|
14
|
+
|
15
|
+
task :create => [:load_config] do
|
16
|
+
ActiveRecord::Tasks::DatabaseTasks.create_current
|
17
|
+
end
|
18
|
+
|
19
|
+
task :migrate => [:load_config] do
|
20
|
+
ActiveRecord::Tasks::DatabaseTasks.migrate
|
21
|
+
ns["_dump"].invoke
|
22
|
+
end
|
23
|
+
|
24
|
+
task :_dump do
|
25
|
+
ns["schema:dump"].invoke
|
26
|
+
ns["_dump"].reenable
|
27
|
+
end
|
28
|
+
|
29
|
+
task :version => [:load_config] do
|
30
|
+
puts "Current version: #{ActiveRecord::Migrator.current_version}"
|
31
|
+
end
|
32
|
+
|
33
|
+
namespace :migrate do
|
34
|
+
task :reset => ["db:drop", "db:create", "db:migrate"]
|
35
|
+
end
|
36
|
+
|
37
|
+
namespace :schema do
|
38
|
+
task :dump => [:load_config] do
|
39
|
+
require "active_record/schema_dumper"
|
40
|
+
filename = ENV["SCHEMA"] || File.join(ActiveRecord::Tasks::DatabaseTasks.db_dir, "schema.rb")
|
41
|
+
File.open(filename, "w:utf-8") do |file|
|
42
|
+
ActiveRecord::SchemaDumper.dump(ActiveRecord::Base.connection, file)
|
43
|
+
end
|
44
|
+
ns["schema:dump"].reenable
|
45
|
+
end
|
46
|
+
|
47
|
+
task :load => [:load_config] do
|
48
|
+
ActiveRecord::Tasks::DatabaseTasks.load_schema_current(:ruby, ENV["SCHEMA"])
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task :load_config do
|
53
|
+
ActiveRecord::Tasks::DatabaseTasks.tap do |config|
|
54
|
+
config.root = Rake.application.original_dir
|
55
|
+
config.env = ENV["SKYCRAWLERS_ENV"] || "development"
|
56
|
+
config.db_dir = "db_itp"
|
57
|
+
config.migrations_paths = ["db_itp/migrate"]
|
58
|
+
config.fixtures_path = "test/fixtures"
|
59
|
+
config.seed_loader = seed_loader.new
|
60
|
+
config.database_configuration = YAML.load_file("config/database_itp.yml")
|
61
|
+
end
|
62
|
+
ActiveRecord::Base.configurations = YAML.load_file("config/database_itp.yml")
|
63
|
+
ActiveRecord::Base.establish_connection(DaimonSkycrawlers.env.to_sym)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require_relative "./tasks/database_tasks"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: daimon_skycrawlers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryunosuke SATO
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-10-
|
11
|
+
date: 2016-10-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -151,7 +151,7 @@ dependencies:
|
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: '0'
|
153
153
|
- !ruby/object:Gem::Dependency
|
154
|
-
name:
|
154
|
+
name: typhoeus
|
155
155
|
requirement: !ruby/object:Gem::Requirement
|
156
156
|
requirements:
|
157
157
|
- - ">="
|
@@ -343,6 +343,7 @@ files:
|
|
343
343
|
- lib/daimon_skycrawlers/processor/default.rb
|
344
344
|
- lib/daimon_skycrawlers/processor/spider.rb
|
345
345
|
- lib/daimon_skycrawlers/queue.rb
|
346
|
+
- lib/daimon_skycrawlers/sitemap_parser.rb
|
346
347
|
- lib/daimon_skycrawlers/storage.rb
|
347
348
|
- lib/daimon_skycrawlers/storage/base.rb
|
348
349
|
- lib/daimon_skycrawlers/storage/file.rb
|
@@ -352,6 +353,23 @@ files:
|
|
352
353
|
- lib/daimon_skycrawlers/tasks/database_tasks.rake
|
353
354
|
- lib/daimon_skycrawlers/timer.rb
|
354
355
|
- lib/daimon_skycrawlers/version.rb
|
356
|
+
- sample/itp-crawler/Gemfile
|
357
|
+
- sample/itp-crawler/Gemfile.lock
|
358
|
+
- sample/itp-crawler/README.md
|
359
|
+
- sample/itp-crawler/Rakefile
|
360
|
+
- sample/itp-crawler/app/crawlers/itp_crawler.rb
|
361
|
+
- sample/itp-crawler/app/models/itp_base.rb
|
362
|
+
- sample/itp-crawler/app/models/itp_shop.rb
|
363
|
+
- sample/itp-crawler/app/processors/itp_processor.rb
|
364
|
+
- sample/itp-crawler/config/database.yml
|
365
|
+
- sample/itp-crawler/config/database_itp.yml
|
366
|
+
- sample/itp-crawler/config/init.rb
|
367
|
+
- sample/itp-crawler/db/migrate/20161018044144_create_page.rb
|
368
|
+
- sample/itp-crawler/db/schema.rb
|
369
|
+
- sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb
|
370
|
+
- sample/itp-crawler/db_itp/schema.rb
|
371
|
+
- sample/itp-crawler/lib/tasks.rb
|
372
|
+
- sample/itp-crawler/lib/tasks/database_tasks.rb
|
355
373
|
- sample/spider/Gemfile
|
356
374
|
- sample/spider/README.md
|
357
375
|
- sample/spider/Rakefile
|