daimon_skycrawlers 0.6.0 → 0.7.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 1658e1827b07bfebb54e4d84b3a9dd21f11edd66
- data.tar.gz: 5abf3dc3cbb8d8423b8f110e2c805ca1d919f59c
+ metadata.gz: e81cffca8abac23dba0da166eb2a1f6cef80b2c1
+ data.tar.gz: d4948c33c37ee8d9508d93c037919a784a2d21cc
  SHA512:
- metadata.gz: 16ff307268fa11cc34454808f8bdbefded16a325e043c349ae07e48758fa8dcaae69457bbd7435e2685fa7026d8848677513780f9e1d7b42352acd54f5f33cdb
- data.tar.gz: 841d61776c8094e6d1eebb149e72dc42a1c9a71fdc453c34f06dea465a5838c37db633bed7f79a93c3beb8827e05ca549fc3e8cd0cd8e3f9e17b52e349f5c787
+ metadata.gz: 4840d12acec8c75a13330810029a32c62827e35a5ca871b06dec0b1f80e7be9b8dd9fa08208a4b0daac4cd527a32ba1bdc1c893c815834ebbfe22750fd440997
+ data.tar.gz: 072f215aa3f245445c9ca335651fe875c238039978f3cf0eac6bc873b0d34b1c1dd4626cfa2320ce4ac5e9b1defe05ffba8ec63dcf9e5c8197472c2fa5cd9cea
@@ -28,7 +28,7 @@ Gem::Specification.new do |spec|
  spec.add_dependency "railties"
  spec.add_dependency "pg"
  spec.add_dependency "timers"
- spec.add_dependency "sitemap-parser"
+ spec.add_dependency "typhoeus"
  spec.add_dependency "webrobots"

  spec.add_development_dependency "rake", "~> 10.0"
@@ -1,8 +1,8 @@
  require "daimon_skycrawlers"
  require "daimon_skycrawlers/crawler"
  require "daimon_skycrawlers/processor"
+ require "daimon_skycrawlers/sitemap_parser"
  require "daimon_skycrawlers/version"
- require "sitemap-parser"
  require "webrobots"

  module DaimonSkycrawlers
@@ -27,6 +27,7 @@ module DaimonSkycrawlers
  desc "sitemap [OPTIONS] URL", "Enqueue URLs from sitemap.xml"
  method_option("robots-txt", aliases: ["-r"], type: :boolean,
  desc: "URL for robots.txt. Detect robots.txt automatically if URL is not robots.txt")
+ method_option("dump", type: :boolean, desc: "Dump URLs without enqueue")
  def sitemap(url)
  load_init
  if options["robots-txt"]
@@ -35,9 +36,11 @@ module DaimonSkycrawlers
  else
  sitemaps = [url]
  end
- urls = sitemaps.flat_map do |sitemap|
- sitemap_parser = SitemapParser.new(sitemap)
- sitemap_parser.to_a
+ sitemap_parser = DaimonSkycrawlers::SitemapParser.new(sitemaps)
+ urls = sitemap_parser.parse
+ if options["dump"]
+ puts urls.join("\n")
+ return
  end
  urls.each do |_url|
  DaimonSkycrawlers::Crawler.enqueue_url(_url)
@@ -5,6 +5,8 @@ require "daimon_skycrawlers/logger"
  require "daimon_skycrawlers/config"
  require "daimon_skycrawlers/storage"
  require "daimon_skycrawlers/processor"
+ require "daimon_skycrawlers/filter/update_checker"
+ require "daimon_skycrawlers/filter/robots_txt_checker"

  module DaimonSkycrawlers
  module Crawler
@@ -1,6 +1,4 @@
  require "daimon_skycrawlers/crawler/base"
- require "daimon_skycrawlers/filter/update_checker"
- require "daimon_skycrawlers/filter/robots_txt_checker"

  module DaimonSkycrawlers
  module Crawler
@@ -24,6 +24,7 @@ module DaimonSkycrawlers
  # @return [void]
  def enqueue_url(url, message = {})
  message[:url] = url
+ config.logger.debug("#{queue_name}: #{url}")
  SongkickQueue.publish(queue_name, message)
  end

@@ -6,24 +6,51 @@ require "daimon_skycrawlers/consumer/http_response"
  module DaimonSkycrawlers
  module Processor
  class << self
+ #
+ # Run registered processors
+ #
+ # @param process_name [String] Process name
+ #
  def run(process_name: default_process_name)
  DaimonSkycrawlers::Timer.setup_shutdown_timer(config.queue_name_prefix, interval: config.shutdown_interval)
  SongkickQueue::Worker.new(process_name, [DaimonSkycrawlers::Consumer::HTTPResponse]).run
  end

+ #
+ # Enqueue a URL to the processor queue
+ #
+ # @param url [String] Absolute URL
+ # @param message [Hash] Extra parameters for the crawler
+ # @return [void]
  def enqueue_http_response(url, message = {})
  message[:url] = url
+ config.logger.debug("#{queue_name}: #{url}")
  SongkickQueue.publish(queue_name, message)
  end

+ #
+ # Shortcut for DaimonSkycrawlers.configuration
+ #
+ # @return [DaimonSkycrawlers::Configuration]
+ #
  def config
  DaimonSkycrawlers.configuration
  end

+ #
+ # Queue name for the processor
+ #
+ # @return [String] Queue name
+ #
  def queue_name
  "#{config.queue_name_prefix}.http-response"
  end

+ #
+ # Default process name
+ #
+ # @return [String] Default process name
+ #
  def default_process_name
  "#{config.queue_name_prefix}:http-response"
  end
@@ -0,0 +1,79 @@
+ require "nokogiri"
+ require "typhoeus"
+ require "zlib"
+ require "uri"
+
+ module DaimonSkycrawlers
+ # Based on https://github.com/benbalter/sitemap-parser
+ class SitemapParser
+ def initialize(urls, options = {})
+ @urls = urls
+ end
+
+ def parse
+ hydra = Typhoeus::Hydra.new(max_concurrency: 1)
+ sitemap_urls = []
+ @urls.each do |url|
+ if URI(url).scheme.start_with?("http")
+ request = Typhoeus::Request.new(url, followlocation: true)
+ request.on_complete do |response|
+ sitemap_urls.concat(on_complete(response))
+ end
+ hydra.queue(request)
+ else
+ if File.exist?(url)
+ extract_urls(File.read(url))
+ end
+ end
+ end
+ hydra.run
+ sitemap_urls
+ end
+
+ private
+
+ def on_complete(response)
+ raise "HTTP request to #{response.effective_url} failed" unless response.success?
+ raw_sitemap = inflate_response(response)
+ extract_urls(raw_sitemap)
+ end
+
+ def extract_urls(body)
+ sitemap = Nokogiri::XML(body)
+ case
+ when sitemap.at("sitemapindex")
+ urls = sitemap.search("sitemap").flat_map do |s|
+ s.at("loc").content
+ end
+ SitemapParser.new(urls).parse
+ when sitemap.at("urlset")
+ sitemap.search("url").flat_map do |url|
+ url.at("loc").content
+ end
+ else
+ raise "Malformed sitemap.xml: no <sitemapindex> or <urlset>"
+ end
+ end
+
+ def inflate_response(response)
+ if compressed?(response)
+ # MAX_WBITS + 32 makes zlib detect the gzip/zlib header automatically,
+ # so both gzip- and deflate-encoded sitemaps inflate correctly.
+ stream = Zlib::Inflate.new(Zlib::MAX_WBITS + 32)
+ stream.inflate(response.body)
+ else
+ response.body
+ end
+ end
+
+ def compressed?(response)
+ case response.headers["Content-Encoding"]&.downcase
+ when "deflate", "gzip", "x-gzip"
+ true
+ else
+ signature = response.body[0, 2]
+ signature == "\x1F\x8B".b
+ end
+ end
+ end
+ end
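The CLI's `sitemap` command above is the only in-tree caller of this new class. As a minimal standalone sketch (the sitemap URL below is a placeholder), the parser takes an array of sitemap URLs and returns the page URLs they list:

```ruby
require "daimon_skycrawlers/sitemap_parser"

# Follows nested <sitemapindex> entries recursively and inflates
# gzip/deflate-compressed sitemaps before extracting <loc> values.
parser = DaimonSkycrawlers::SitemapParser.new(["https://example.com/sitemap.xml"])
puts parser.parse
```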
@@ -9,9 +9,8 @@ module DaimonSkycrawlers
  class RDB < Base
  def initialize(config_path = "config/database.yml")
  super()
- config = YAML.load_file(config_path)
- environment = ENV["SKYCRAWLERS_ENV"] || "development"
- ActiveRecord::Base.establish_connection(config[environment])
+ Base.configurations = YAML.load_file(config_path)
+ Base.establish_connection(DaimonSkycrawlers.env.to_sym)
  end

  #
@@ -38,7 +37,11 @@ module DaimonSkycrawlers
  Page.where(url: url).order(last_modified_at: :desc).limit(1).first
  end

- class Page < ActiveRecord::Base
+ class Base < ActiveRecord::Base
+ self.abstract_class = true
+ end
+
+ class Page < Base
  self.table_name = "pages"
  end
  end
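The itp-crawler sample later in this diff reads pages back through `storage.find`. A rough sketch of the new connection flow, assuming the class is exposed as `DaimonSkycrawlers::Storage::RDB` and that `config/database.yml` has an entry for the current environment:

```ruby
require "daimon_skycrawlers/storage"

# The constructor now assigns the YAML file to the abstract Base class and
# connects with the environment name (SKYCRAWLERS_ENV, "development" by default).
storage = DaimonSkycrawlers::Storage::RDB.new("config/database.yml")
page = storage.find("http://example.com/")  # latest stored Page for that URL
```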
@@ -30,24 +30,23 @@ seed_loader = Class.new do
  end
  end

- ActiveRecord::Tasks::DatabaseTasks.tap do |config|
- config.root = Rake.application.original_dir
- config.env = ENV["SKYCRAWLERS_ENV"] || "development"
- config.db_dir = "db"
- config.migrations_paths = ["db/migrate"]
- config.fixtures_path = "test/fixtures"
- config.seed_loader = seed_loader.new
- config.database_configuration = ActiveRecord::Base.configurations
- end
-
  # db:load_config can be overridden manually
  Rake::Task["db:seed"].enhance(["db:load_config"])
  Rake::Task["db:load_config"].clear

  Rake::Task.define_task("db:environment")
  Rake::Task.define_task("db:load_config") do
- ActiveRecord::Base.configurations = YAML.load_file("config/database.yml")
+ ActiveRecord::Tasks::DatabaseTasks.tap do |config|
+ config.root = Rake.application.original_dir
+ config.env = ENV["SKYCRAWLERS_ENV"] || "development"
+ config.db_dir = "db"
+ config.migrations_paths = ["db/migrate"]
+ config.fixtures_path = "test/fixtures"
+ config.seed_loader = seed_loader.new
+ config.database_configuration = YAML.load_file("config/database.yml")
+ end
  environment = ENV["SKYCRAWLERS_ENV"] || "development"
+ ActiveRecord::Base.configurations = ActiveRecord::Tasks::DatabaseTasks.database_configuration
  ActiveRecord::Base.establish_connection(environment.to_sym)
  end
  Rake::Task["db:test:deprecated"].clear if Rake::Task.task_defined?("db:test:deprecated")
@@ -1,3 +1,3 @@
  module DaimonSkycrawlers
- VERSION = "0.6.0"
+ VERSION = "0.7.0"
  end
@@ -74,5 +74,12 @@ module DaimonSkycrawlers
  puts ex.message
  exit(false)
  end
+
+ #
+ # Return current environment
+ #
+ def env
+ ENV["SKYCRAWLERS_ENV"] || "development"
+ end
  end
  end
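Both the storage change above and the itp-crawler sample below lean on this helper when establishing connections; a brief illustration:

```ruby
ENV["SKYCRAWLERS_ENV"] = "production"
DaimonSkycrawlers.env  # => "production"; falls back to "development" when unset

# The pattern used by the sample models:
ActiveRecord::Base.establish_connection(DaimonSkycrawlers.env.to_sym)
```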
@@ -0,0 +1,4 @@
+ source "https://rubygems.org"
+
+ gem "rake"
+ gem "daimon_skycrawlers", path: "../../"
@@ -0,0 +1,106 @@
+ PATH
+ remote: ../../
+ specs:
+ daimon_skycrawlers (0.6.0)
+ activerecord
+ bundler (~> 1.11)
+ faraday
+ faraday_middleware
+ nokogiri
+ pg
+ railties
+ songkick_queue
+ thor
+ timers
+ typhoeus
+ webrobots
+
+ GEM
+ remote: https://rubygems.org/
+ specs:
+ actionpack (5.0.0.1)
+ actionview (= 5.0.0.1)
+ activesupport (= 5.0.0.1)
+ rack (~> 2.0)
+ rack-test (~> 0.6.3)
+ rails-dom-testing (~> 2.0)
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
+ actionview (5.0.0.1)
+ activesupport (= 5.0.0.1)
+ builder (~> 3.1)
+ erubis (~> 2.7.0)
+ rails-dom-testing (~> 2.0)
+ rails-html-sanitizer (~> 1.0, >= 1.0.2)
+ activemodel (5.0.0.1)
+ activesupport (= 5.0.0.1)
+ activerecord (5.0.0.1)
+ activemodel (= 5.0.0.1)
+ activesupport (= 5.0.0.1)
+ arel (~> 7.0)
+ activesupport (5.0.0.1)
+ concurrent-ruby (~> 1.0, >= 1.0.2)
+ i18n (~> 0.7)
+ minitest (~> 5.1)
+ tzinfo (~> 1.1)
+ amq-protocol (2.0.1)
+ arel (7.1.4)
+ builder (3.2.2)
+ bunny (2.6.0)
+ amq-protocol (>= 2.0.1)
+ concurrent-ruby (1.0.2)
+ erubis (2.7.0)
+ ethon (0.9.1)
+ ffi (>= 1.3.0)
+ faraday (0.9.2)
+ multipart-post (>= 1.2, < 3)
+ faraday_middleware (0.10.0)
+ faraday (>= 0.7.4, < 0.10)
+ ffi (1.9.14)
+ hitimes (1.2.4)
+ i18n (0.7.0)
+ loofah (2.0.3)
+ nokogiri (>= 1.5.9)
+ method_source (0.8.2)
+ mini_portile2 (2.1.0)
+ minitest (5.9.1)
+ multipart-post (2.0.0)
+ nokogiri (1.6.8.1)
+ mini_portile2 (~> 2.1.0)
+ pg (0.19.0)
+ rack (2.0.1)
+ rack-test (0.6.3)
+ rack (>= 1.0)
+ rails-dom-testing (2.0.1)
+ activesupport (>= 4.2.0, < 6.0)
+ nokogiri (~> 1.6.0)
+ rails-html-sanitizer (1.0.3)
+ loofah (~> 2.0)
+ railties (5.0.0.1)
+ actionpack (= 5.0.0.1)
+ activesupport (= 5.0.0.1)
+ method_source
+ rake (>= 0.8.7)
+ thor (>= 0.18.1, < 2.0)
+ rake (11.3.0)
+ songkick_queue (1.0.0)
+ activesupport (>= 3.0.0)
+ bunny (~> 2.2)
+ thor (0.19.1)
+ thread_safe (0.3.5)
+ timers (4.1.1)
+ hitimes
+ typhoeus (0.8.0)
+ ethon (>= 0.8.0)
+ tzinfo (1.2.2)
+ thread_safe (~> 0.1)
+ webrobots (0.1.2)
+
+ PLATFORMS
+ ruby
+
+ DEPENDENCIES
+ daimon_skycrawlers!
+ rake
+
+ BUNDLED WITH
+ 1.12.5
@@ -0,0 +1,50 @@
+ # itp-crawler
+
+ Simple crawler for [iタウンページ](http://itp.ne.jp)
+
+ ## Requirements
+
+ - Ruby
+ - RabbitMQ
+ - RDB
+   - PostgreSQL (default)
+   - MySQL
+   - SQLite3
+
+ ## Usage
+
+ 1. Install dependencies
+
+ ```
+ $ bundle install
+ ```
+
+ 2. Create the databases
+
+ ```
+ $ bundle exec rake db:create
+ $ bundle exec rake db:migrate
+ $ bundle exec rake itp:db:create
+ $ bundle exec rake itp:db:migrate
+ ```
+
+ 3. Open new terminals and run the crawler and the processor
+
+ ```
+ $ bundle exec daimon_skycrawlers exec crawler # on new terminal
+ $ bundle exec daimon_skycrawlers exec processor # on new terminal
+ ```
+
+ 4. Enqueue a task
+
+ ```
+ $ bundle exec daimon_skycrawlers enqueue url "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
+ ```
+
+ 5. You'll see `It works with 'http://example.com'` in the terminal that runs your processor!
+
+ 6. You can re-enqueue a task for the processor
+
+ ```
+ $ bundle exec daimon_skycrawlers enqueue response "http://itp.ne.jp/osaka/genre_dir/niku/?num=50"
+ ```
@@ -0,0 +1,2 @@
+ require "daimon_skycrawlers/tasks"
+ require_relative "./lib/tasks"
@@ -0,0 +1,8 @@
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/crawler"
+ require "daimon_skycrawlers/crawler/default"
+
+
+ base_url = "http://itp.ne.jp/"
+ crawler = DaimonSkycrawlers::Crawler::Default.new(base_url)
+ DaimonSkycrawlers.register_crawler(crawler)
@@ -0,0 +1,5 @@
+ class ItpBase < ActiveRecord::Base
+ self.abstract_class = true
+ self.configurations = YAML.load_file("config/database_itp.yml")
+ self.establish_connection(DaimonSkycrawlers.env.to_sym)
+ end
@@ -0,0 +1,5 @@
+ require_relative "./itp_base"
+
+ class ItpShop < ItpBase
+ self.table_name = "shops"
+ end
@@ -0,0 +1,95 @@
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/processor"
+ require "daimon_skycrawlers/processor/base"
+
+ require_relative "../models/itp_shop"
+
+ class ItpProcessor < DaimonSkycrawlers::Processor::Base
+ def call(message)
+ key_url = message[:url]
+ page = storage.find(key_url)
+ @doc = Nokogiri::HTML(page.body.encode("UTF-8", "CP932"))
+ ItpShop.transaction do
+ prepare_shops do |shop|
+ itp_shop = ItpShop.find_or_initialize_by(itp_url: shop.itp_url)
+ itp_shop.assign_attributes(shop.to_h)
+ itp_shop.save!
+ end
+ end
+ unless %r(/pg/) =~ key_url
+ enqueue_pages(key_url)
+ end
+ end
+
+ Shop = Struct.new(:name, :description, :itp_url, :zip_code, :address, :phone)
+
+ private
+
+ def prepare_shops
+ @doc.search(".normalResultsBox").each do |shop|
+ begin
+ name_element = shop.at("section h4 .blueText") || shop.at("section h4 .brackText")
+ name = name_element.content.strip
+ description = shop.at("section p").content.strip
+ # Avoid mistaking the address line for the description
+ if description.start_with?("住所")
+ description = nil
+ end
+ itp_path = shop.at("section h4 a").attr("href")
+ phone = shop.at("section p b").content.strip
+ address_element = shop.search("section p").detect do |element|
+ /住所/ =~ element.content
+ end
+ address_element.search("span").unlink
+ address_element.search("a").unlink
+ address_text = address_element.content.strip
+ zip_code = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 1)
+ address = address_text.slice(/〒(\d{3}-\d{4})(.+)/, 2).sub(/\A[[:space:]]+/, "")
+ s = Shop.new(name,
+ description,
+ retrieve_individual_page_url(itp_path),
+ zip_code,
+ address,
+ phone)
+ yield s
+ rescue => e
+ log.warn("#{e.class}: #{e.message}")
+ log.debug(e.backtrace.join("\n"))
+ break
+ end
+ end
+ end
+
+ # NOTE: HEAD requests to itp.ne.jp are very slow
+ #
+ # /shop/KN2700060500184274/?url=%2F0663026300%2F&s_bid=KN2700060500184274&s_sid=FSP-LSR-002&s_fr=V01&s_ck=C01
+ # http://nttbj.itp.ne.jp/0663026300/index.html
+ #
+ # or
+ #
+ # /shop/KN2700060500039708/
+ # http://itp.ne.jp/shop/KN2700060500039708/
+ def retrieve_individual_page_url(path)
+ shop_id = path.slice(/\/\?url=(.+)&/, 1)
+ uri = if shop_id
+ URI("http://nttbj.itp.ne.jp/") + URI.unescape(shop_id) + "index.html"
+ else
+ URI("http://itp.ne.jp/") + path
+ end
+ uri.to_s
+ end
+
+ MAX_PAGE_NUM = 100
+
+ def enqueue_pages(base_url)
+ search_result = @doc.at("h1.searchResultHeader").content.strip.slice(/(\d+)件/, 1).to_i
+ # itp.ne.jp displays at most 5000 search results.
+ 2.upto([(search_result / 50), MAX_PAGE_NUM].min) do |n|
+ url = URI.join(base_url, "pg/#{n}/?num=50")
+ DaimonSkycrawlers::Crawler.enqueue_url(url.to_s)
+ end
+ end
+ end
+
+ processor = ItpProcessor.new
+ DaimonSkycrawlers.register_processor(processor)
@@ -0,0 +1,26 @@
+ # PostgreSQL. Versions 8.2 and up are supported.
+ #
+ default: &default
+ adapter: postgresql
+ encoding: unicode
+ pool: 5
+
+ development:
+ <<: *default
+ database: itp-crawler_development
+ #username: itp-crawler
+ #password:
+ #host: localhost
+ #port: 5432
+ #schema_search_path: myapp,sharedapp,public
+ #min_messages: notice
+
+ test:
+ <<: *default
+ database: itp-crawler_test
+
+ production:
+ <<: *default
+ database: itp-crawler_production
+ username: itp-crawler
+ password: <%= ENV['ITP-CRAWLER_PASSWORD'] %>
@@ -0,0 +1,26 @@
+ # PostgreSQL. Versions 8.2 and up are supported.
+ #
+ default: &default
+ adapter: postgresql
+ encoding: unicode
+ pool: 5
+
+ development:
+ <<: *default
+ database: itp-processor_development
+ #username: itp-processor
+ #password:
+ #host: localhost
+ #port: 5432
+ #schema_search_path: myapp,sharedapp,public
+ #min_messages: notice
+
+ test:
+ <<: *default
+ database: itp-processor_test
+
+ production:
+ <<: *default
+ database: itp-processor_production
+ username: itp-processor
+ password: <%= ENV['ITP-PROCESSOR_PASSWORD'] %>
@@ -0,0 +1,22 @@
+ require "bundler/setup"
+ require "daimon_skycrawlers"
+ require "daimon_skycrawlers/logger"
+ require "daimon_skycrawlers/queue"
+
+ DaimonSkycrawlers.configure do |config|
+ config.logger = DaimonSkycrawlers::Logger.default
+ config.crawler_interval = 1
+ config.shutdown_interval = 30
+ end
+
+ DaimonSkycrawlers::Queue.configure do |config|
+ # queue configuration
+ config.logger = DaimonSkycrawlers.configuration.logger
+ config.host = "127.0.0.1"
+ config.port = 5672
+ # config.username = 'guest'
+ # config.password = 'guest'
+ config.vhost = "/"
+ config.max_reconnect_attempts = 10
+ config.network_recovery_interval = 1.0
+ end
@@ -0,0 +1,13 @@
+ class CreatePage < ActiveRecord::Migration[5.0]
+ def change
+ create_table :pages do |t|
+ t.string :url
+ t.text :headers
+ t.binary :body
+ t.datetime :last_modified_at
+ t.string :etag
+
+ t.timestamps
+ end
+ end
+ end
@@ -0,0 +1,28 @@
+ # This file is auto-generated from the current state of the database. Instead
+ # of editing this file, please use the migrations feature of Active Record to
+ # incrementally modify your database, and then regenerate this schema definition.
+ #
+ # Note that this schema.rb definition is the authoritative source for your
+ # database schema. If you need to create the application database on another
+ # system, you should be using db:schema:load, not running all the migrations
+ # from scratch. The latter is a flawed and unsustainable approach (the more migrations
+ # you'll amass, the slower it'll run and the greater likelihood for issues).
+ #
+ # It's strongly recommended that you check this file into your version control system.
+
+ ActiveRecord::Schema.define(version: 20161018044144) do
+
+ # These are extensions that must be enabled in order to support this database
+ enable_extension "plpgsql"
+
+ create_table "pages", force: :cascade do |t|
+ t.string "url"
+ t.text "headers"
+ t.binary "body"
+ t.datetime "last_modified_at"
+ t.string "etag"
+ t.datetime "created_at", null: false
+ t.datetime "updated_at", null: false
+ end
+
+ end
@@ -0,0 +1,14 @@
+ class CreateShop < ActiveRecord::Migration[5.0]
+ def change
+ create_table :shops do |t|
+ t.string :name, index: true
+ t.text :description
+ t.string :itp_url, index: true
+ t.string :zip_code
+ t.string :address
+ t.string :phone
+
+ t.timestamps
+ end
+ end
+ end
@@ -0,0 +1,31 @@
+ # This file is auto-generated from the current state of the database. Instead
+ # of editing this file, please use the migrations feature of Active Record to
+ # incrementally modify your database, and then regenerate this schema definition.
+ #
+ # Note that this schema.rb definition is the authoritative source for your
+ # database schema. If you need to create the application database on another
+ # system, you should be using db:schema:load, not running all the migrations
+ # from scratch. The latter is a flawed and unsustainable approach (the more migrations
+ # you'll amass, the slower it'll run and the greater likelihood for issues).
+ #
+ # It's strongly recommended that you check this file into your version control system.
+
+ ActiveRecord::Schema.define(version: 20161020044144) do
+
+ # These are extensions that must be enabled in order to support this database
+ enable_extension "plpgsql"
+
+ create_table "shops", force: :cascade do |t|
+ t.string "name"
+ t.text "description"
+ t.string "itp_url"
+ t.string "zip_code"
+ t.string "address"
+ t.string "phone"
+ t.datetime "created_at", null: false
+ t.datetime "updated_at", null: false
+ t.index ["itp_url"], name: "index_shops_on_itp_url", using: :btree
+ t.index ["name"], name: "index_shops_on_name", using: :btree
+ end
+
+ end
@@ -0,0 +1,66 @@
+ require "daimon_skycrawlers"
+
+ seed_loader = Class.new do
+ def load_seed
+ # load "#{ActiveRecord::Tasks::DatabaseTasks.db_dir}/seeds.rb"
+ end
+ end
+
+ namespace :itp do
+ namespace :db do |ns|
+ task :drop => [:load_config] do
+ ActiveRecord::Tasks::DatabaseTasks.drop_current
+ end
+
+ task :create => [:load_config] do
+ ActiveRecord::Tasks::DatabaseTasks.create_current
+ end
+
+ task :migrate => [:load_config] do
+ ActiveRecord::Tasks::DatabaseTasks.migrate
+ ns["_dump"].invoke
+ end
+
+ task :_dump do
+ ns["schema:dump"].invoke
+ ns["_dump"].reenable
+ end
+
+ task :version => [:load_config] do
+ puts "Current version: #{ActiveRecord::Migrator.current_version}"
+ end
+
+ namespace :migrate do
+ task :reset => ["db:drop", "db:create", "db:migrate"]
+ end
+
+ namespace :schema do
+ task :dump => [:load_config] do
+ require "active_record/schema_dumper"
+ filename = ENV["SCHEMA"] || File.join(ActiveRecord::Tasks::DatabaseTasks.db_dir, "schema.rb")
+ File.open(filename, "w:utf-8") do |file|
+ ActiveRecord::SchemaDumper.dump(ActiveRecord::Base.connection, file)
+ end
+ ns["schema:dump"].reenable
+ end
+
+ task :load => [:load_config] do
+ ActiveRecord::Tasks::DatabaseTasks.load_schema_current(:ruby, ENV["SCHEMA"])
+ end
+ end
+
+ task :load_config do
+ ActiveRecord::Tasks::DatabaseTasks.tap do |config|
+ config.root = Rake.application.original_dir
+ config.env = ENV["SKYCRAWLERS_ENV"] || "development"
+ config.db_dir = "db_itp"
+ config.migrations_paths = ["db_itp/migrate"]
+ config.fixtures_path = "test/fixtures"
+ config.seed_loader = seed_loader.new
+ config.database_configuration = YAML.load_file("config/database_itp.yml")
+ end
+ ActiveRecord::Base.configurations = YAML.load_file("config/database_itp.yml")
+ ActiveRecord::Base.establish_connection(DaimonSkycrawlers.env.to_sym)
+ end
+ end
+ end
@@ -0,0 +1 @@
+ require_relative "./tasks/database_tasks"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: daimon_skycrawlers
  version: !ruby/object:Gem::Version
- version: 0.6.0
+ version: 0.7.0
  platform: ruby
  authors:
  - Ryunosuke SATO
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-10-18 00:00:00.000000000 Z
+ date: 2016-10-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -151,7 +151,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: sitemap-parser
+ name: typhoeus
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -343,6 +343,7 @@ files:
  - lib/daimon_skycrawlers/processor/default.rb
  - lib/daimon_skycrawlers/processor/spider.rb
  - lib/daimon_skycrawlers/queue.rb
+ - lib/daimon_skycrawlers/sitemap_parser.rb
  - lib/daimon_skycrawlers/storage.rb
  - lib/daimon_skycrawlers/storage/base.rb
  - lib/daimon_skycrawlers/storage/file.rb
@@ -352,6 +353,23 @@ files:
  - lib/daimon_skycrawlers/tasks/database_tasks.rake
  - lib/daimon_skycrawlers/timer.rb
  - lib/daimon_skycrawlers/version.rb
+ - sample/itp-crawler/Gemfile
+ - sample/itp-crawler/Gemfile.lock
+ - sample/itp-crawler/README.md
+ - sample/itp-crawler/Rakefile
+ - sample/itp-crawler/app/crawlers/itp_crawler.rb
+ - sample/itp-crawler/app/models/itp_base.rb
+ - sample/itp-crawler/app/models/itp_shop.rb
+ - sample/itp-crawler/app/processors/itp_processor.rb
+ - sample/itp-crawler/config/database.yml
+ - sample/itp-crawler/config/database_itp.yml
+ - sample/itp-crawler/config/init.rb
+ - sample/itp-crawler/db/migrate/20161018044144_create_page.rb
+ - sample/itp-crawler/db/schema.rb
+ - sample/itp-crawler/db_itp/migrate/20161020044144_create_shop.rb
+ - sample/itp-crawler/db_itp/schema.rb
+ - sample/itp-crawler/lib/tasks.rb
+ - sample/itp-crawler/lib/tasks/database_tasks.rb
  - sample/spider/Gemfile
  - sample/spider/README.md
  - sample/spider/Rakefile