daimon_skycrawlers 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1658e1827b07bfebb54e4d84b3a9dd21f11edd66
4
- data.tar.gz: 5abf3dc3cbb8d8423b8f110e2c805ca1d919f59c
3
+ metadata.gz: e81cffca8abac23dba0da166eb2a1f6cef80b2c1
4
+ data.tar.gz: d4948c33c37ee8d9508d93c037919a784a2d21cc
5
5
  SHA512:
6
- metadata.gz: 16ff307268fa11cc34454808f8bdbefded16a325e043c349ae07e48758fa8dcaae69457bbd7435e2685fa7026d8848677513780f9e1d7b42352acd54f5f33cdb
7
- data.tar.gz: 841d61776c8094e6d1eebb149e72dc42a1c9a71fdc453c34f06dea465a5838c37db633bed7f79a93c3beb8827e05ca549fc3e8cd0cd8e3f9e17b52e349f5c787
6
+ metadata.gz: 4840d12acec8c75a13330810029a32c62827e35a5ca871b06dec0b1f80e7be9b8dd9fa08208a4b0daac4cd527a32ba1bdc1c893c815834ebbfe22750fd440997
7
+ data.tar.gz: 072f215aa3f245445c9ca335651fe875c238039978f3cf0eac6bc873b0d34b1c1dd4626cfa2320ce4ac5e9b1defe05ffba8ec63dcf9e5c8197472c2fa5cd9cea
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "railties"
29
29
  spec.add_dependency "pg"
30
30
  spec.add_dependency "timers"
31
- spec.add_dependency "sitemap-parser"
31
+ spec.add_dependency "typhoeus"
32
32
  spec.add_dependency "webrobots"
33
33
 
34
34
  spec.add_development_dependency "rake", "~> 10.0"
@@ -1,8 +1,8 @@
1
1
  require "daimon_skycrawlers"
2
2
  require "daimon_skycrawlers/crawler"
3
3
  require "daimon_skycrawlers/processor"
4
+ require "daimon_skycrawlers/sitemap_parser"
4
5
  require "daimon_skycrawlers/version"
5
- require "sitemap-parser"
6
6
  require "webrobots"
7
7
 
8
8
  module DaimonSkycrawlers
@@ -27,6 +27,7 @@ module DaimonSkycrawlers
27
27
  desc "sitemap [OPTIONS] URL", "Enqueue URLs from simtemap.xml"
28
28
  method_option("robots-txt", aliases: ["-r"], type: :boolean,
29
29
  desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
30
+ method_option("dump", type: :boolean, desc: "Dump URLs without enqueue")
30
31
  def sitemap(url)
31
32
  load_init
32
33
  if options["robots-txt"]
@@ -35,9 +36,11 @@ module DaimonSkycrawlers
35
36
  else
36
37
  sitemaps = [url]
37
38
  end
38
- urls = sitemaps.flat_map do |sitemap|
39
- sitemap_parser = SitemapParser.new(sitemap)
40
- sitemap_parser.to_a
39
+ sitemap_parser = DaimonSkycrawlers::SitemapParser.new(sitemaps)
40
+ urls = sitemap_parser.parse
41
+ if options["dump"]
42
+ puts urls.join("\n")
43
+ return
41
44
  end
42
45
  urls.each do |_url|
43
46
  DaimonSkycrawlers::Crawler.enqueue_url(_url)
@@ -5,6 +5,8 @@ require "daimon_skycrawlers/logger"
5
5
  require "daimon_skycrawlers/config"
6
6
  require "daimon_skycrawlers/storage"
7
7
  require "daimon_skycrawlers/processor"
8
+ require "daimon_skycrawlers/filter/update_checker"
9
+ require "daimon_skycrawlers/filter/robots_txt_checker"
8
10
 
9
11
  module DaimonSkycrawlers
10
12
  module Crawler
@@ -1,6 +1,4 @@
1
1
  require "daimon_skycrawlers/crawler/base"
2
- require "daimon_skycrawlers/filter/update_checker"
3
- require "daimon_skycrawlers/filter/robots_txt_checker"
4
2
 
5
3
  module DaimonSkycrawlers
6
4
  module Crawler
@@ -24,6 +24,7 @@ module DaimonSkycrawlers
24
24
  # @return [void]
25
25
  def enqueue_url(url, message = {})
26
26
  message[:url] = url
27
+ config.logger.debug("#{queue_name}: #{url}")
27
28
  SongkickQueue.publish(queue_name, message)
28
29
  end
29
30
 
@@ -6,24 +6,51 @@ require "daimon_skycrawlers/consumer/http_response"
6
6
  module DaimonSkycrawlers
7
7
  module Processor
8
8
  class << self
9
+ #
10
+ # Run registered processors
11
+ #
12
+ # @param process_name [String] Process name
13
+ #
9
14
  def run(process_name: default_process_name)
10
15
  DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
11
16
  SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
12
17
  end
13
18
 
19
+ #
20
+ # Enqueue a URL to processor queue
21
+ #
22
+ # @param url [String] Specify absolute URL
23
+ # @param message [Hash] Extra parameters for processor
24
+ # @return [void]
14
25
  def enqueue_http_response(url, message = {})
15
26
  message[:url] = url
27
+ config.logger.debug("#{queue_name}: #{url}")
16
28
  SongkickQueue.publish(queue_name, message)
17
29
  end
18
30
 
31
+ #
32
+ # Shortcut of DaimonSkycrawlers.configuration
33
+ #
34
+ # @return [DaimonSkycrawlers::Configuration]
35
+ #
19
36
  def config
20
37
  DaimonSkycrawlers.configuration
21
38
  end
22
39
 
40
+ #
41
+ # Queue name for processor
42
+ #
43
+ # @return [String] Queue name
44
+ #
23
45
  def queue_name
24
46
  "#{config.queue_name_prefix}.http-response"
25
47
  end
26
48
 
49
+ #
50
+ # Default process name
51
+ #
52
+ # @return [String] Default process name
53
+ #
27
54
  def default_process_name
28
55
  "#{config.queue_name_prefix}:http-response"
29
56
  end
@@ -0,0 +1,79 @@
1
+ require "nokogiri"
2
+ require "typhoeus"
3
+ require "zlib"
4
+ require "uri"
5
+
6
+ module DaimonSkycrawlers
7
+ # Based on https://github.com/benbalter/sitemap-parser
8
# Collects the page URLs listed by one or more sitemap documents.
#
# Every source is either an http(s) URL, fetched through Typhoeus, or a path
# to a local file. A <sitemapindex> document is followed recursively by
# parsing the sitemaps it lists; a <urlset> document contributes the content
# of the <loc> element of each <url> entry.
class SitemapParser
  # First two bytes of a gzip stream, used to sniff compressed bodies.
  GZIP_MAGIC = "\x1F\x8B".b.freeze

  #
  # @param urls [Array<String>] sitemap locations (http(s) URLs or local file paths)
  # @param options [Hash] accepted for interface compatibility; not used here
  #
  def initialize(urls, options = {})
    @urls = urls
  end

  #
  # Fetch and parse every sitemap given to the constructor.
  #
  # Local files that do not exist are skipped silently.
  #
  # @return [Array<String>] URLs found in all sitemaps
  # @raise [RuntimeError] when an HTTP request fails or a document has
  #   neither <sitemapindex> nor <urlset>
  #
  def parse
    hydra = Typhoeus::Hydra.new(max_concurrency: 1)
    sitemap_urls = []
    @urls.each do |url|
      if remote?(url)
        request = Typhoeus::Request.new(url, followlocation: true)
        request.on_complete do |response|
          sitemap_urls.concat(on_complete(response))
        end
        hydra.queue(request)
      elsif File.exist?(url)
        # Keep the URLs extracted from a local file; they must end up in
        # the result just like the ones fetched over HTTP.
        sitemap_urls.concat(extract_urls(File.read(url)))
      end
    end
    # Queued requests only run here, so HTTP results are appended after
    # the results of local files.
    hydra.run
    sitemap_urls
  end

  private

  # True when +url+ has an http/https scheme. A plain file path has no
  # scheme (nil) and a string URI() cannot parse is treated as a path.
  def remote?(url)
    URI(url).scheme.to_s.start_with?("http")
  rescue URI::InvalidURIError
    false
  end

  # Turn a finished HTTP response into the list of URLs it contains.
  def on_complete(response)
    raise "HTTP request to #{response.effective_url} failed" unless response.success?
    extract_urls(inflate_response(response))
  end

  # Parse a sitemap document and return the URLs it lists, following
  # sitemap index documents recursively.
  def extract_urls(body)
    sitemap = Nokogiri::XML(body)
    if sitemap.at("sitemapindex")
      urls = sitemap.search("sitemap").map { |s| s.at("loc").content }
      SitemapParser.new(urls).parse
    elsif sitemap.at("urlset")
      sitemap.search("url").map { |url| url.at("loc").content }
    else
      raise "Malformed sitemap.xml no <sitemapindex> or <urlset>"
    end
  end

  # Return the response body, decompressed when it looks compressed.
  def inflate_response(response)
    return response.body unless compressed?(response)

    # MAX_WBITS + 32 asks zlib to auto-detect a zlib or gzip header.
    stream = Zlib::Inflate.new(Zlib::MAX_WBITS + 32)
    begin
      stream.inflate(response.body)
    ensure
      stream.close
    end
  end

  # A body counts as compressed when the Content-Encoding header says so
  # or when it starts with the gzip magic bytes.
  def compressed?(response)
    case response.headers["Content-Encoding"]&.downcase
    when "deflate", "gzip", "x-gzip"
      true
    else
      # Compare raw bytes so the string's encoding tag does not matter.
      response.body.byteslice(0, 2).to_s.b == GZIP_MAGIC
    end
  end
end
79
+ end
@@ -9,9 +9,8 @@ module DaimonSkycrawlers
9
9
  class RDB < Base
10
10
  def initialize(config_path = "config/database.yml")
11
11
  super()
12
- config = YAML.load_file(config_path)
13
- environment = ENV["SKYCRAWLERS_ENV"] || "development"
14
- ActiveRecord::Base.establish_connection(config[environment])
12
+ Base.configurations = YAML.load_file(config_path)
13
+ Base.establish_connection(DaimonSkycrawlers.env.to_sym)
15
14
  end
16
15
 
17
16
  #
@@ -38,7 +37,11 @@ module DaimonSkycrawlers
38
37
  Page.where(url: url).order(last_modified_at: :desc).limit(1).first
39
38
  end
40
39
 
41
- class Page < ActiveRecord::Base
40
+ class Base < ActiveRecord::Base
41
+ self.abstract_class = true
42
+ end
43
+
44
+ class Page < Base
42
45
  self.table_name = "pages"
43
46
  end
44
47
  end
@@ -30,24 +30,23 @@ seed_loader = Class.new do
30
30
  end
31
31
  end
32
32
 
33
- ActiveRecord::Tasks::DatabaseTasks.tap do |config|
34
- config.root = Rake.application.original_dir
35
- config.env = ENV["SKYCRAWLERS_ENV"] || "development"
36
- config.db_dir = "db"
37
- config.migrations_paths = ["db/migrate"]
38
- config.fixtures_path = "test/fixtures"
39
- config.seed_loader = seed_loader.new
40
- config.database_configuration = ActiveRecord::Base.configurations
41
- end
42
-
43
33
  # db:load_config can be overridden manually
44
34
  Rake::Task["db:seed"].enhance(["db:load_config"])
45
35
  Rake::Task["db:load_config"].clear
46
36
 
47
37
  Rake::Task.define_task("db:environment")
48
38
  Rake::Task.define_task("db:load_config") do
49
- ActiveRecord::Base.configurations = YAML.load_file("config/database.yml")
39
+ ActiveRecord::Tasks::DatabaseTasks.tap do |config|
40
+ config.root = Rake.application.original_dir
41
+ config.env = ENV["SKYCRAWLERS_ENV"] || "development"
42
+ config.db_dir = "db"
43
+ config.migrations_paths = ["db/migrate"]
44
+ config.fixtures_path = "test/fixtures"
45
+ config.seed_loader = seed_loader.new
46
+ config.database_configuration = YAML.load_file("config/database.yml")
47
+ end
50
48
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
49
+ ActiveRecord::Base.configurations = ActiveRecord::Tasks::DatabaseTasks.database_configuration
51
50
  ActiveRecord::Base.establish_connection(environment.to_sym)
52
51
  end
53
52
  Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
@@ -1,3 +1,3 @@
1
1
  module DaimonSkycrawlers
2
- VERSION = "0.6.0"
2
+ VERSION = "0.7.0"
3
3
  end
@@ -74,5 +74,12 @@ module DaimonSkycrawlers
74
74
  puts ex.message
75
75
  exit(false)
76
76
  end
77
+
78
+ #
79
+ # Return current environment
80
+ #
81
+ def env
82
+ ENV["SKYCRAWLERS_ENV"] || "development"
83
+ end
77
84
  end
78
85
  end
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org"
2
+
3
+ gem "rake"
4
+ gem "daimon_skycrawlers", path: "../../"
@@ -0,0 +1,106 @@
1
+ PATH
2
+ remote: ../../
3
+ specs:
4
+ daimon_skycrawlers (0.6.0)
5
+ activerecord
6
+ bundler (~> 1.11)
7
+ faraday
8
+ faraday_middleware
9
+ nokogiri
10
+ pg
11
+ railties
12
+ songkick_queue
13
+ thor
14
+ timers
15
+ typhoeus
16
+ webrobots
17
+
18
+ GEM
19
+ remote: https://rubygems.org/
20
+ specs:
21
+ actionpack (5.0.0.1)
22
+ actionview (= 5.0.0.1)
23
+ activesupport (= 5.0.0.1)
24
+ rack (~> 2.0)
25
+ rack-test (~> 0.6.3)
26
+ rails-dom-testing (~> 2.0)
27
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
28
+ actionview (5.0.0.1)
29
+ activesupport (= 5.0.0.1)
30
+ builder (~> 3.1)
31
+ erubis (~> 2.7.0)
32
+ rails-dom-testing (~> 2.0)
33
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
34
+ activemodel (5.0.0.1)
35
+ activesupport (= 5.0.0.1)
36
+ activerecord (5.0.0.1)
37
+ activemodel (= 5.0.0.1)
38
+ activesupport (= 5.0.0.1)
39
+ arel (~> 7.0)
40
+ activesupport (5.0.0.1)
41
+ concurrent-ruby (~> 1.0, >= 1.0.2)
42
+ i18n (~> 0.7)
43
+ minitest (~> 5.1)
44
+ tzinfo (~> 1.1)
45
+ amq-protocol (2.0.1)
46
+ arel (7.1.4)
47
+ builder (3.2.2)
48
+ bunny (2.6.0)
49
+ amq-protocol (>= 2.0.1)
50
+ concurrent-ruby (1.0.2)
51
+ erubis (2.7.0)
52
+ ethon (0.9.1)
53
+ ffi (>= 1.3.0)
54
+ faraday (0.9.2)
55
+ multipart-post (>= 1.2, < 3)
56
+ faraday_middleware (0.10.0)
57
+ faraday (>= 0.7.4, < 0.10)
58
+ ffi (1.9.14)
59
+ hitimes (1.2.4)
60
+ i18n (0.7.0)
61
+ loofah (2.0.3)
62
+ nokogiri (>= 1.5.9)
63
+ method_source (0.8.2)
64
+ mini_portile2 (2.1.0)
65
+ minitest (5.9.1)
66
+ multipart-post (2.0.0)
67
+ nokogiri (1.6.8.1)
68
+ mini_portile2 (~> 2.1.0)
69
+ pg (0.19.0)
70
+ rack (2.0.1)
71
+ rack-test (0.6.3)
72
+ rack (>= 1.0)
73
+ rails-dom-testing (2.0.1)
74
+ activesupport (>= 4.2.0, < 6.0)
75
+ nokogiri (~> 1.6.0)
76
+ rails-html-sanitizer (1.0.3)
77
+ loofah (~> 2.0)
78
+ railties (5.0.0.1)
79
+ actionpack (= 5.0.0.1)
80
+ activesupport (= 5.0.0.1)
81
+ method_source
82
+ rake (>= 0.8.7)
83
+ thor (>= 0.18.1, < 2.0)
84
+ rake (11.3.0)
85
+ songkick_queue (1.0.0)
86
+ activesupport (>= 3.0.0)
87
+ bunny (~> 2.2)
88
+ thor (0.19.1)
89
+ thread_safe (0.3.5)
90
+ timers (4.1.1)
91
+ hitimes
92
+ typhoeus (0.8.0)
93
+ ethon (>= 0.8.0)
94
+ tzinfo (1.2.2)
95
+ thread_safe (~> 0.1)
96
+ webrobots (0.1.2)
97
+
98
+ PLATFORMS
99
+ ruby
100
+
101
+ DEPENDENCIES
102
+ daimon_skycrawlers!
103
+ rake
104
+
105
+ BUNDLED WITH
106
+ 1.12.5
@@ -0,0 +1,50 @@
1
+ # itp-crawler
2
+
3
+ Simple crawler for [iタウンページ](http://itp.ne.jp)
4
+
5
+ ## Requirements
6
+
7
+ - Ruby
8
+ - RabbitMQ
9
+ - RDB
10
+ - PostgreSQL (default)
11
+ - MySQL
12
+ - SQLite3
13
+
14
+ ## Usage
15
+
16
+ 1. Install dependencies
17
+
18
+ ```
19
+ $ bundle install
20
+ ```
21
+
22
+ 2. Create database
23
+
24
+ ```
25
+ $ bundle exec rake db:create
26
+ $ bundle exec rake db:migrate
27
+ $ bundle exec rake itp:db:create
28
+ $ bundle exec rake itp:db:migrate
29
+ ```
30
+
31
+ 3. Open new terminal and run crawler/processor
32
+
33
+ ```
34
+ $ bundle exec daimon_skycrawlers exec crawler # on new terminal
35
+ $ bundle exec daimon_skycrawlers exec processor # on new terminal
36
+ ```
37
+
38
+ 4. Enqueue task
39
+
40
+ ```
41
+ $ bundle exec daimon_skycrawlers enqueue url "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
42
+ ```
43
+
44
+ 5. You'll see `It works with 'http://example.com'` on your terminal which runs your processor!
45
+
46
+ 6. You can re-enqueue task for processor
47
+
48
+ ```
49
+ $ bundle exec daimon_skycrawlers enqueue response "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
50
+ ```
@@ -0,0 +1,2 @@
1
+ require "daimon_skycrawlers/tasks"
2
+ require_relative "./lib/tasks"
@@ -0,0 +1,8 @@
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/crawler"
3
+ require "daimon_skycrawlers/crawler/default"
4
+
5
+
6
+ base_url = "http://itp.ne.jp/"
7
+ crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
8
+ DaimonSkycrawlers.register_crawler(crawler)
@@ -0,0 +1,5 @@
1
+ class ItpBase < ActiveRecord::Base
2
+ self.abstract_class = true
3
+ self.configurations = YAML.load_file("config/database_itp.yml")
4
+ self.establish_connection(DaimonSkycrawlers.env.to_sym)
5
+ end
@@ -0,0 +1,5 @@
1
+ require_relative "./itp_base"
2
+
3
+ class ItpShop < ItpBase
4
+ self.table_name = "shops"
5
+ end
@@ -0,0 +1,95 @@
1
+ require "daimon_skycrawlers"
2
+ require "daimon_skycrawlers/processor"
3
+ require "daimon_skycrawlers/processor/base"
4
+
5
+ require_relative "../models/itp_shop"
6
+
7
# Processor for itp.ne.jp search result pages.
#
# Stores every shop listed on a fetched page in ItpShop and, when the page
# is not itself a paginated "/pg/" page, enqueues the remaining result pages
# for crawling.
class ItpProcessor < DaimonSkycrawlers::Processor::Base
  # Value object for one shop scraped from a result page.
  Shop = Struct.new(:name, :description, :itp_url, :zip_code, :address, :phone)

  # Number of results requested per page (the "?num=50" query).
  RESULTS_PER_PAGE = 50
  # itp.ne.jp can display 5000 search results, i.e. 100 pages of 50.
  MAX_PAGE_NUM = 100
  # Captures the zip code (1) and the rest of the address (2).
  ADDRESS_PATTERN = /〒(\d{3}-\d{4})(.+)/

  #
  # Process one crawled page.
  #
  # @param message [Hash] queue message; +:url+ is the key of the stored page
  # @return [void]
  #
  def call(message)
    key_url = message[:url]
    page = storage.find(key_url)
    # The stored body is transcoded from CP932 to UTF-8 before parsing.
    @doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
    ItpShop.transaction do
      prepare_shops do |shop|
        itp_shop = ItpShop.find_or_initialize_by(itp_url: shop.itp_url)
        itp_shop.assign_attributes(shop.to_h)
        itp_shop.save!
      end
    end
    # Only a non-paginated URL fans out to the following pages.
    enqueue_pages(key_url) unless %r(/pg/) =~ key_url
  end

  private

  # Yield a Shop for every result box in the current document.
  #
  # Any error raised while scraping or handling an entry is logged and
  # stops the iteration over the remaining entries of this page.
  def prepare_shops
    @doc.search(".normalResultsBox").each do |shop|
      begin
        name_element = shop.at("section h4 .blueText") || shop.at("section h4 .brackText")
        name = name_element.content.strip
        description = shop.at("section p").content.strip
        # Avoid false detection for shop address
        description = nil if description.start_with?("住所")
        itp_path = shop.at("section h4 a").attr("href")
        phone = shop.at("section p b").content.strip
        address_element = shop.search("section p").detect do |element|
          /住所/ =~ element.content
        end
        # Drop nested <span>/<a> nodes so only the address text remains.
        address_element.search("span").unlink
        address_element.search("a").unlink
        matched = address_element.content.strip.match(ADDRESS_PATTERN)
        zip_code = matched[1]
        address = matched[2].sub(/\A[[:space:]]+/, "")
        yield Shop.new(name,
                       description,
                       retrieve_individual_page_url(itp_path),
                       zip_code,
                       address,
                       phone)
      rescue => e
        log.warn("#{e.class}: #{e.message}")
        log.debug(e.backtrace.join("\n"))
        break
      end
    end
  end

  # NOTE: HEAD request to itp.ne.jp is so slow
  #
  #   /shop/KN2700060500184274/?url=%2F0663026300%2F&s_bid=KN2700060500184274&s_sid=FSP-LSR-002&s_fr=V01&s_ck=C01
  #   http://nttbj.itp.ne.jp/0663026300/index.html
  #
  # or
  #
  #   /shop/KN2700060500039708/
  #   http://itp.ne.jp/shop/KN2700060500039708/
  #
  # @param path [String] href of the shop link on the result page
  # @return [String] absolute URL of the shop's individual page
  def retrieve_individual_page_url(path)
    # Capture only the value of the "url" parameter, up to the next "&".
    encoded_path = path.slice(/\/\?url=([^&]+)/, 1)
    uri = if encoded_path
            URI("http://nttbj.itp.ne.jp/") + URI.decode_www_form_component(encoded_path) + "index.html"
          else
            URI("http://itp.ne.jp/") + path
          end
    uri.to_s
  end

  # Enqueue result pages 2..N for the search that +base_url+ started.
  #
  # N is the number of pages needed to show every result, rounded up so a
  # final partial page is included, and capped at MAX_PAGE_NUM.
  def enqueue_pages(base_url)
    search_result = @doc.at("h1.searchResultHeader").content.strip.slice(/(\d+)件/, 1).to_i
    page_count = (search_result + RESULTS_PER_PAGE - 1) / RESULTS_PER_PAGE
    2.upto([page_count, MAX_PAGE_NUM].min) do |n|
      url = URI.join(base_url, "pg/#{n}/?num=#{RESULTS_PER_PAGE}")
      DaimonSkycrawlers::Crawler.enqueue_url(url.to_s)
    end
  end
end
93
+
94
+ processor = ItpProcessor.new
95
+ DaimonSkycrawlers.register_processor(processor)
@@ -0,0 +1,26 @@
1
+ # PostgreSQL. Versions 8.2 and up are supported.
2
+ #
3
+ default: &default
4
+ adapter: postgresql
5
+ encoding: unicode
6
+ pool: 5
7
+
8
+ development:
9
+ <<: *default
10
+ database: itp-crawler_development
11
+ #username: itp-crawler
12
+ #password:
13
+ #host: localhost
14
+ #port: 5432
15
+ #schema_search_path: myapp,sharedapp,public
16
+ #min_messages: notice
17
+
18
+ test:
19
+ <<: *default
20
+ database: itp-crawler_test
21
+
22
+ production:
23
+ <<: *default
24
+ database: itp-crawler_production
25
+ username: itp-crawler
26
+ password: <%= ENV['ITP-CRAWLER_PASSWORD'] %>
@@ -0,0 +1,26 @@
1
+ # PostgreSQL. Versions 8.2 and up are supported.
2
+ #
3
+ default: &default
4
+ adapter: postgresql
5
+ encoding: unicode
6
+ pool: 5
7
+
8
+ development:
9
+ <<: *default
10
+ database: itp-processor_development
11
+ #username: itp-processor
12
+ #password:
13
+ #host: localhost
14
+ #port: 5432
15
+ #schema_search_path: myapp,sharedapp,public
16
+ #min_messages: notice
17
+
18
+ test:
19
+ <<: *default
20
+ database: itp-processor_test
21
+
22
+ production:
23
+ <<: *default
24
+ database: itp-processor_production
25
+ username: itp-processor
26
+ password: <%= ENV['ITP-PROCESSOR_PASSWORD'] %>
@@ -0,0 +1,22 @@
1
+ require "bundler/setup"
2
+ require "daimon_skycrawlers"
3
+ require "daimon_skycrawlers/logger"
4
+ require "daimon_skycrawlers/queue"
5
+
6
+ DaimonSkycrawlers.configure do |config|
7
+ config.logger = DaimonSkycrawlers::Logger.default
8
+ config.crawler_interval = 1
9
+ config.shutdown_interval = 30
10
+ end
11
+
12
+ DaimonSkycrawlers::Queue.configure do |config|
13
+ # queue configuration
14
+ config.logger = DaimonSkycrawlers.configuration.logger
15
+ config.host = "127.0.0.1"
16
+ config.port = 5672
17
+ # config.username = 'guest'
18
+ # config.password = 'guest'
19
+ config.vhost = "/"
20
+ config.max_reconnect_attempts = 10
21
+ config.network_recovery_interval = 1.0
22
+ end
@@ -0,0 +1,13 @@
1
+ class CreatePage < ActiveRecord::Migration[5.0]
2
+ def change
3
+ create_table :pages do |t|
4
+ t.string :url
5
+ t.text :headers
6
+ t.binary :body
7
+ t.datetime :last_modified_at
8
+ t.string :etag
9
+
10
+ t.timestamps
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,28 @@
1
+ # This file is auto-generated from the current state of the database. Instead
2
+ # of editing this file, please use the migrations feature of Active Record to
3
+ # incrementally modify your database, and then regenerate this schema definition.
4
+ #
5
+ # Note that this schema.rb definition is the authoritative source for your
6
+ # database schema. If you need to create the application database on another
7
+ # system, you should be using db:schema:load, not running all the migrations
8
+ # from scratch. The latter is a flawed and unsustainable approach (the more migrations
9
+ # you'll amass, the slower it'll run and the greater likelihood for issues).
10
+ #
11
+ # It's strongly recommended that you check this file into your version control system.
12
+
13
+ ActiveRecord::Schema.define(version: 20161018044144) do
14
+
15
+ # These are extensions that must be enabled in order to support this database
16
+ enable_extension "plpgsql"
17
+
18
+ create_table "pages", force: :cascade do |t|
19
+ t.string "url"
20
+ t.text "headers"
21
+ t.binary "body"
22
+ t.datetime "last_modified_at"
23
+ t.string "etag"
24
+ t.datetime "created_at", null: false
25
+ t.datetime "updated_at", null: false
26
+ end
27
+
28
+ end
@@ -0,0 +1,14 @@
1
+ class CreateShop < ActiveRecord::Migration[5.0]
2
+ def change
3
+ create_table :shops do |t|
4
+ t.string :name, index: true
5
+ t.text :description
6
+ t.string :itp_url, index: true
7
+ t.string :zip_code
8
+ t.string :address
9
+ t.string :phone
10
+
11
+ t.timestamps
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,31 @@
1
+ # This file is auto-generated from the current state of the database. Instead
2
+ # of editing this file, please use the migrations feature of Active Record to
3
+ # incrementally modify your database, and then regenerate this schema definition.
4
+ #
5
+ # Note that this schema.rb definition is the authoritative source for your
6
+ # database schema. If you need to create the application database on another
7
+ # system, you should be using db:schema:load, not running all the migrations
8
+ # from scratch. The latter is a flawed and unsustainable approach (the more migrations
9
+ # you'll amass, the slower it'll run and the greater likelihood for issues).
10
+ #
11
+ # It's strongly recommended that you check this file into your version control system.
12
+
13
+ ActiveRecord::Schema.define(version: 20161020044144) do
14
+
15
+ # These are extensions that must be enabled in order to support this database
16
+ enable_extension "plpgsql"
17
+
18
+ create_table "shops", force: :cascade do |t|
19
+ t.string "name"
20
+ t.text "description"
21
+ t.string "itp_url"
22
+ t.string "zip_code"
23
+ t.string "address"
24
+ t.string "phone"
25
+ t.datetime "created_at", null: false
26
+ t.datetime "updated_at", null: false
27
+ t.index ["itp_url"], name: "index_shops_on_itp_url", using: :btree
28
+ t.index ["name"], name: "index_shops_on_name", using: :btree
29
+ end
30
+
31
+ end
@@ -0,0 +1,66 @@
1
+ require "daimon_skycrawlers"
2
+
3
+ seed_loader = Class.new do
4
+ def load_seed
5
+ # load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
6
+ end
7
+ end
8
+
9
+ namespace :itp do
10
+ namespace :db do |ns|
11
+ task :drop => [:load_config] do
12
+ ActiveRecord::Tasks::DatabaseTasks.drop_current
13
+ end
14
+
15
+ task :create => [:load_config] do
16
+ ActiveRecord::Tasks::DatabaseTasks.create_current
17
+ end
18
+
19
+ task :migrate => [:load_config] do
20
+ ActiveRecord::Tasks::DatabaseTasks.migrate
21
+ ns["_dump"].invoke
22
+ end
23
+
24
+ task :_dump do
25
+ ns["schema:dump"].invoke
26
+ ns["_dump"].reenable
27
+ end
28
+
29
+ task :version => [:load_config] do
30
+ puts "Current version: #{ActiveRecord::Migrator.current_version}"
31
+ end
32
+
33
+ namespace :migrate do
34
+ task :reset => ["db:drop", "db:create", "db:migrate"]
35
+ end
36
+
37
+ namespace :schema do
38
+ task :dump => [:load_config] do
39
+ require "active_record/schema_dumper"
40
+ filename = ENV["SCHEMA"] || File.join(ActiveRecord::Tasks::DatabaseTasks.db_dir, "schema.rb")
41
+ File.open(filename, "w:utf-8") do |file|
42
+ ActiveRecord::SchemaDumper.dump(ActiveRecord::Base.connection, file)
43
+ end
44
+ ns["schema:dump"].reenable
45
+ end
46
+
47
+ task :load => [:load_config] do
48
+ ActiveRecord::Tasks::DatabaseTasks.load_schema_current(:ruby, ENV["SCHEMA"])
49
+ end
50
+ end
51
+
52
+ task :load_config do
53
+ ActiveRecord::Tasks::DatabaseTasks.tap do |config|
54
+ config.root = Rake.application.original_dir
55
+ config.env = ENV["SKYCRAWLERS_ENV"] || "development"
56
+ config.db_dir = "db_itp"
57
+ config.migrations_paths = ["db_itp/migrate"]
58
+ config.fixtures_path = "test/fixtures"
59
+ config.seed_loader = seed_loader.new
60
+ config.database_configuration = YAML.load_file("config/database_itp.yml")
61
+ end
62
+ ActiveRecord::Base.configurations = YAML.load_file("config/database_itp.yml")
63
+ ActiveRecord::Base.establish_connection(DaimonSkycrawlers.env.to_sym)
64
+ end
65
+ end
66
+ end
@@ -0,0 +1 @@
1
+ require_relative "./tasks/database_tasks"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: daimon_skycrawlers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryunosuke SATO
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-10-18 00:00:00.000000000 Z
11
+ date: 2016-10-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -151,7 +151,7 @@ dependencies:
151
151
  - !ruby/object:Gem::Version
152
152
  version: '0'
153
153
  - !ruby/object:Gem::Dependency
154
- name: sitemap-parser
154
+ name: typhoeus
155
155
  requirement: !ruby/object:Gem::Requirement
156
156
  requirements:
157
157
  - - ">="
@@ -343,6 +343,7 @@ files:
343
343
  - lib/daimon_skycrawlers/processor/default.rb
344
344
  - lib/daimon_skycrawlers/processor/spider.rb
345
345
  - lib/daimon_skycrawlers/queue.rb
346
+ - lib/daimon_skycrawlers/sitemap_parser.rb
346
347
  - lib/daimon_skycrawlers/storage.rb
347
348
  - lib/daimon_skycrawlers/storage/base.rb
348
349
  - lib/daimon_skycrawlers/storage/file.rb
@@ -352,6 +353,23 @@ files:
352
353
  - lib/daimon_skycrawlers/tasks/database_tasks.rake
353
354
  - lib/daimon_skycrawlers/timer.rb
354
355
  - lib/daimon_skycrawlers/version.rb
356
+ - sample/itp-crawler/Gemfile
357
+ - sample/itp-crawler/Gemfile.lock
358
+ - sample/itp-crawler/README.md
359
+ - sample/itp-crawler/Rakefile
360
+ - sample/itp-crawler/app/crawlers/itp_crawler.rb
361
+ - sample/itp-crawler/app/models/itp_base.rb
362
+ - sample/itp-crawler/app/models/itp_shop.rb
363
+ - sample/itp-crawler/app/processors/itp_processor.rb
364
+ - sample/itp-crawler/config/database.yml
365
+ - sample/itp-crawler/config/database_itp.yml
366
+ - sample/itp-crawler/config/init.rb
367
+ - sample/itp-crawler/db/migrate/20161018044144_create_page.rb
368
+ - sample/itp-crawler/db/schema.rb
369
+ - sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb
370
+ - sample/itp-crawler/db_itp/schema.rb
371
+ - sample/itp-crawler/lib/tasks.rb
372
+ - sample/itp-crawler/lib/tasks/database_tasks.rb
355
373
  - sample/spider/Gemfile
356
374
  - sample/spider/README.md
357
375
  - sample/spider/Rakefile