news_crawler 0.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/news_crawler +94 -0
- data/lib/news_crawler/autostart.rb +33 -0
- data/lib/news_crawler/config.rb +53 -0
- data/lib/news_crawler/crawler_module.rb +70 -0
- data/lib/news_crawler/default_config.yml +13 -0
- data/lib/news_crawler/default_sds.yml +1 -0
- data/lib/news_crawler/downloader.rb +112 -0
- data/lib/news_crawler/link_selector/same_domain_selector.rb +172 -0
- data/lib/news_crawler/nc_logger.rb +49 -0
- data/lib/news_crawler/storage/raw_data/mongo_storage.rb +77 -0
- data/lib/news_crawler/storage/raw_data/raw_data_engine.rb +67 -0
- data/lib/news_crawler/storage/raw_data.rb +74 -0
- data/lib/news_crawler/storage/url_queue/mongo_storage.rb +218 -0
- data/lib/news_crawler/storage/url_queue/url_queue_engine.rb +124 -0
- data/lib/news_crawler/storage/url_queue/url_queue_error.rb +28 -0
- data/lib/news_crawler/storage/url_queue.rb +150 -0
- data/lib/news_crawler/url_helper.rb +50 -0
- data/lib/news_crawler/utils/robots_patch.rb +34 -0
- data/lib/news_crawler.rb +47 -0
- metadata +203 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 3cdcc66767575d17c8a1f5d25f00713f74d51e98
  data.tar.gz: 7c7d2c838066a45c4365a0756625e4a08c5d4482
SHA512:
  metadata.gz: 75f259b0cafbff494302955a1b7968d331d69298c8e5a31185b0b8288408f7af323721eb83a2043005e2acb7e12337051622e191356da497892eb35701dfe7d8
  data.tar.gz: 4382b0ed8d4bc28134d8af322f9a2f46b95a8732b2629835ae0b49da4e3ecdaa6fcce671c7fd5adc8cc198fb8e107360678277767b9cc9d706bae7a6530b2dd4
data/bin/news_crawler
ADDED
@@ -0,0 +1,94 @@
#! /usr/bin/env ruby
# -*- coding: utf-8 -*-

#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'optparse'
require 'news_crawler/config'
require 'news_crawler/nc_logger'

require 'news_crawler/downloader'
require 'news_crawler/link_selector/same_domain_selector'

include NewsCrawler::Storage

options = {}

OptionParser.new do | opts |
  opts.banner = "Usage: news_crawler [options] url"

  opts.on('-c', "--app-conf FILE", "Application configuration file") do | f |
    options[:app_conf] = File.expand_path(f)
    raise Errno::ENOENT unless File.exists? options[:app_conf]
  end

  opts.on('-sds', "--sds-conf FILE", "Same domain selector configuration file") do | f |
    options[:sds_conf] = File.expand_path(f)
    raise Errno::ENOENT unless File.exists? options[:sds_conf]
  end

  opts.on('-c', '--[no-]cleardb', "Clear database") do | cd |
    options[:cleardb] = cd
  end

  opts.on('-d', "--max-depth DEPTH", OptionParser::DecimalInteger,
          'Maximum depth of url to crawl') do | d |
    options[:max_depth] = d
  end
end.parse!


NewsCrawler::CrawlerConfig.load_application_config(options[:app_conf]) unless options[:app_conf].nil?
NewsCrawler::CrawlerConfig.load_samedomainselector_config(options[:sds_conf]) unless options[:sds_conf].nil?

config = SimpleConfig.for :application
NewsCrawler::Storage::RawData.set_engine(config.db.engine.intern)
NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)

if (options[:cleardb])
  URLQueue.clear
  RawData.clear
end

if ARGV.size > 0
  url = ARGV[0]
  URLQueue.add(url)
end

puts "Starting Downloader"
dwl = NewsCrawler::Downloader.new(false)
dwl.async.run

puts "Starting SDS"
se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
se.async.run
puts "Stoping SDS"
se.graceful_terminate
se.terminate
puts "SDS stopped"

sleep(5)

puts "Stoping Downloader"
dwl.graceful_terminate
dwl.terminate
puts "Downloader stopped"
data/lib/news_crawler/autostart.rb
ADDED
@@ -0,0 +1,33 @@
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'simple_config'
require 'mongo'

require 'news_crawler/config'
require 'news_crawler/storage/url_queue'

SimpleConfig.for(:application) do
  set :prefix, 'test'
end

config = SimpleConfig.for :application
NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
data/lib/news_crawler/config.rb
ADDED
@@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'simple_config'

module NewsCrawler
  class CrawlerConfig
    DEFAULT_CONFIG = File.join(File.dirname(__FILE__),
                               './default_config.yml')
    DEFAULT_SDS_CONFIG = File.join(File.dirname(__FILE__),
                                   './default_sds.yml')

    def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
      if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
        @app_loaded = true
        SimpleConfig.for :application do
          load file
        end
      end
    end

    def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
      if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
        @sds_loaded = true
        SimpleConfig.for :same_domain_selector do
          load file
        end
      end
    end
  end
end

NewsCrawler::CrawlerConfig.load_application_config
NewsCrawler::CrawlerConfig.load_samedomainselector_config
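CrawlerConfig loads the bundled default_config.yml and default_sds.yml once when this file is first required, and reloads only when an explicit path is passed in. A minimal sketch of overriding the defaults from application code, assuming two hypothetical YAML files in the working directory:

require 'news_crawler/config'

# Both file names are placeholders; any YAML that SimpleConfig can load should work.
NewsCrawler::CrawlerConfig.load_application_config(File.expand_path('my_app.yml'))
NewsCrawler::CrawlerConfig.load_samedomainselector_config(File.expand_path('my_sds.yml'))

config = SimpleConfig.for :application   # the resulting application settings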
data/lib/news_crawler/crawler_module.rb
ADDED
@@ -0,0 +1,70 @@
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'news_crawler/storage/url_queue'
require 'thread'

module NewsCrawler
  # Include this to get basic module methods
  module CrawlerModule
    # Mark current url process state of current module is processed
    # @param [ String ] url
    def mark_processed(url)
      URLQueue.mark(self.class.name, url, URLQueue::PROCESSED)
    end

    # Mark current url process state of current module is unprocessed
    # @param [ String ] url
    def mark_unprocessed(url)
      URLQueue.mark(self.class.name, url, URLQueue::UNPROCESSED)
    end

    # Find all visited unprocessed url
    # @param [ Fixnum ] max_depth max url depth return (inclusive)
    # @return [ Array ] URL list
    def find_unprocessed(max_depth = -1)
      URLQueue.find_all(self.class.name, URLQueue::UNPROCESSED, max_depth)
    end

    # Find one visited url with given current module process state
    # @param [ String ] state one of unprocessed, processing, processed
    # @param [ Fixnum ] max_depth max url depth return (inclusive)
    # @return [ Array ] URL list
    def find_all(state, max_depth = -1)
      URLQueue.find_all(self.class.name, state, max_depth)
    end

    # Find all visited urls with current module's state
    # @param [ String ] state
    # @param [ Fixnum ] max_depth max url depth return (inclusive)
    # @return [ String, nil ] URL or nil if url doesn't exists
    def find_one(state, max_depth = -1)
      URLQueue.find_one(self.class.name, state, max_depth)
    end

    # Get next unprocessed a url and mark it as processing in atomic
    # @param [ Fixnum ] max_depth max url depth return (inclusive)
    # @return [ String, nil ] URL or nil if url doesn't exists
    def next_unprocessed(max_depth = -1)
      URLQueue.next_unprocessed(self.class.name, max_depth)
    end
  end
end
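CrawlerModule is the mixin every processing stage uses for its bookkeeping: state is recorded per module (keyed by the including class's name) in the URL queue. A minimal sketch of a custom stage built on it; the TitlePrinter class is purely illustrative and not part of the gem:

require 'news_crawler/crawler_module'
require 'news_crawler/storage/url_queue'

module NewsCrawler
  # Hypothetical stage: drains the queue of URLs this module hasn't handled yet.
  class TitlePrinter
    include CrawlerModule

    def run
      while (url = next_unprocessed)   # atomically claims one unprocessed URL, or nil
        puts "seen: #{url}"
        mark_processed(url)            # record completion for this module only
      end
    end
  end
end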
data/lib/news_crawler/default_sds.yml
ADDED
@@ -0,0 +1 @@
:exclude:
data/lib/news_crawler/downloader.rb
ADDED
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'celluloid'
require 'typhoeus'
require 'simpleconfig'

require 'news_crawler/config'
require 'news_crawler/storage/raw_data'
require 'news_crawler/utils/robots_patch'
require 'news_crawler/nc_logger'

module NewsCrawler
  # This class implement an parallel downloader based on Typhoes
  # with given queue
  class Downloader
    include Celluloid

    CONCURRENT_DOWNLOAD = 4

    # Construct downloader with an URLQueue
    # @param [ Boolean ] start_on_create whether start selector immediately
    # @param [ NewsCrawler::URLQueue ] queue url queue
    def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
      @queue = queue
      @urls = queue.find_unvisited
      @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
      @wait_time = 1
      @status = :running
      @stoping = false
      wait_for_url if start_on_create
    end

    # Start downloader with current queue
    # URL successed fetch is marked and result's stored in DB
    def run
      @status = :running
      hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
      # TODO Log here
      @urls = @urls.keep_if do | url |
        Robots.instance.allowed? url
      end
      requests = @urls.map do | url |
        re = Typhoeus::Request.new(url, followlocation: true)
        re.on_complete do | response |
          if response.success?
            Storage::RawData.add(url, response.response_body)
            @queue.mark_visited url
          else
            NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
          end
        end
        hydra.queue re
        re
      end
      hydra.run
      @urls = []
      wait_for_url
    end

    # Graceful terminate this downloader
    def graceful_terminate
      @stoping = true
      while @status == :running
        sleep(1)
      end
    end

    private
    # Waiting for new urls're added to queue, using backoff algorithms
    def wait_for_url
      @status = :waiting
      if @stoping # check for stop flag
        return
      end
      sleep @wait_time
      get_new_url
      if @urls.size == 0
        if @wait_time < 30
          @wait_time = @wait_time * 2
        end
        wait_for_url
      else
        @wait_time = 1
        run
      end
    end

    def get_new_url
      @urls = @queue.find_unvisited
    end
  end
end
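Because Downloader includes Celluloid, it is normally run as an actor; the sketch below mirrors how data/bin/news_crawler drives it, and assumes the storage engines have already been configured through the application config:

require 'news_crawler/downloader'
require 'news_crawler/storage/url_queue'

NewsCrawler::Storage::URLQueue.add('http://example.com')   # seed the queue (placeholder URL)

dwl = NewsCrawler::Downloader.new(false)   # false: don't start polling inside the constructor
dwl.async.run                              # fetch pages on the actor's thread

sleep(30)                                  # let it work for a while (arbitrary)
dwl.graceful_terminate                     # wait for the current batch to finish
dwl.terminate                              # shut the Celluloid actor down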
data/lib/news_crawler/link_selector/same_domain_selector.rb
ADDED
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'celluloid'
require 'nokogiri'

require 'news_crawler/storage/raw_data'
require 'news_crawler/url_helper'
require 'news_crawler/crawler_module'
require 'news_crawler/nc_logger'

module NewsCrawler
  module LinkSelector
    # Select all link from same domain.
    # Domain is got from database
    class SameDomainSelector
      include NewsCrawler::URLHelper
      extend NewsCrawler::URLHelper

      include NewsCrawler::CrawlerModule
      include Celluloid

      # Create new selector with queue
      # URL's selected is put back into queue
      # @param [ Fixnum ] max_depth maxinum depth to crawl
      # @param [ Boolean ] start_on_create whether start selector immediately
      def initialize(max_depth = -1, start_on_create = true)
        @max_depth = max_depth
        @wait_time = 1
        @status = :running
        @stoping = false
        run if start_on_create
      end

      # Extract url from page
      def extract_url(url)
        doc = RawData.find_by_url(url)
        html_doc = Nokogiri::HTML(doc)
        results = []

        inner_url = html_doc.xpath('//a').collect { | a_el |
          temp_url = (a_el.attribute 'href').to_s
          if (!temp_url.nil?) && (temp_url[0] == '/')
            temp_url = url + temp_url
          end
          temp_url
        }

        inner_url.delete_if { | url |
          (url.nil?) || (url.size == 0) || (url == '#')
        }

        # select url from same domain
        inner_url.select { | o_url |
          if (same_domain?(o_url, url))
            if (!SameDomainSelector.exclude?(o_url))
              begin
                URLQueue.add(o_url, url)
                results << [o_url, url]
              rescue URLQueue::DuplicateURLError => e
              end
            else
              # TODO Log here
            end
          end
        }
      end

      def run
        @status = :running
        return if @stoping
        if @max_depth == 0
          @status = :stopped
          return
        end
        while !@stoping
          url = next_unprocessed(@max_depth - 1)
          while (url.nil?)
            wait_for_url
            url = next_unprocessed(@max_depth - 1)
          end
          NCLogger.get_logger.info "Processing #{url}"
          extract_url(url)
          mark_processed(url)
        end
      end

      # Test whether url is excluded
      # @param [ String ] url
      # @return [ Boolean ] true if url is excluded, false otherwise
      def self.exclude?(url)
        config = SimpleConfig.for :same_domain_selector
        exclude_list = []
        url_domain = get_url_path(url)[:domain]
        begin
          exclude_group = config.exclude
        rescue NoMethodError => e
          return false
        end

        exclude_group.to_hash.keys.each do | url_e |
          if url_domain.to_s.end_with? url_e.to_s
            exclude_list = config.exclude.get(url_e)
            break
          end
        end

        exclude_list = exclude_list.map do | elt |
          if /^\/.*\/$/ =~ elt
            Regexp.new(elt[1..-2]) # already an Regex
          else
            new_elt = "^(.*/)?#{elt}(/.*)?$"
            Regexp.new(new_elt)
          end
        end

        if exclude_list.count == 0
          return false
        end

        # url.split('/').each do | part |
        #   if exclude_list.include? part
        #     return true
        #   end
        # end
        exclude_list.each do | exclude_rule |
          if exclude_rule =~ url
            return true
          end
        end
        return false
      end

      # Graceful terminate this selector
      def graceful_terminate
        @stoping = true
        while @status == :running
          sleep(1)
        end
      end

      private
      # Waiting for new urls're added to queue, using backoff algorithms
      def wait_for_url
        @status = :waiting
        sleep @wait_time
        if @wait_time < 30
          @wait_times = @wait_time * 2
        end
      end
    end
  end
end
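exclude? walks a same_domain_selector config whose exclude section maps a domain suffix to a list of patterns: entries wrapped in slashes are compiled directly as regular expressions, anything else is matched as a whole path segment. Inferred from that code (the structure below is a guess, not a documented format), a file passed via --sds-conf might look like:

:exclude:
  example.com:
    - tag            # skips any URL with a /tag/ path segment
    - /\?page=\d+/   # slash-wrapped entries are used as regexps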
data/lib/news_crawler/nc_logger.rb
ADDED
@@ -0,0 +1,49 @@
#! /usr/bin/env ruby
# -*- coding: utf-8 -*-

#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'logger'

module NewsCrawler
  class NCLogger
    # Get logger
    def self.get_logger
      @logger ||= Logger.new(STDERR)
      @logger.progname = 'news_crawler'
      @logger
    end

    # Set logger level
    # param [ Logger::Severity ] l level
    def self.set_level(l)
      get_logger.level = l
    end

    # Set logger, should same API as Ruby Logger
    # param [ Object ] l logger
    def self.set_logdev(ld)
      @logger = Logger.new(ld)
      @logger.progname = 'news_crawler'
    end
  end
end
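Both Downloader and SameDomainSelector log through NCLogger, so destination and verbosity can be adjusted in one place before the crawl starts; a small sketch (the log file name is a placeholder):

require 'logger'
require 'news_crawler/nc_logger'

NewsCrawler::NCLogger.set_logdev('crawler.log')   # replace the default STDERR logger with a file
NewsCrawler::NCLogger.set_level(Logger::WARN)     # then raise the severity threshold
NewsCrawler::NCLogger.get_logger.warn('starting crawl')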
data/lib/news_crawler/storage/raw_data/mongo_storage.rb
ADDED
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'mongo'
require 'simple_config'
require 'news_crawler/storage/raw_data/raw_data_engine'


module NewsCrawler
  module Storage
    module RawData
      # Raw data storage implement using MongoDB
      class MongoStorage < NewsCrawler::Storage::RawData::RawDataEngine
        NAME = 'mongo'

        include Mongo

        def initialize(*opts)
          config = (SimpleConfig.for :application)
          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
          db = client[config.mongodb.db_name]
          @coll = db[config.prefix + '_' + config.suffix.raw_data]
          @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
        end

        # Add entry to raw data collection, overwrite old data
        # param [ String ] url
        # param [ String ] body
        def add(url, body)
          @coll.update({:url => url},
                       {:$set => {:body => body}},
                       {:upsert => true})
        end

        # Find document with correspond url
        # @param [ String ] url
        # @return [ String, nil ]
        def find_by_url(url)
          result = @coll.find_one({:url => url})
          if (!result.nil?)
            result['body']
          else
            nil
          end
        end

        # Get number of raw data entries
        def count
          @coll.count
        end

        def clear
          @coll.remove
        end
      end
    end
  end
end
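MongoStorage pulls everything it needs from the :application config: mongodb.host, mongodb.port and mongodb.db_name for the connection, plus prefix and suffix.raw_data for the collection name. A sketch of providing those values in code with simple_config's group/set DSL (all values are placeholders, and the db.engine key mirrors what data/bin/news_crawler reads for set_engine):

require 'simple_config'

SimpleConfig.for :application do
  group :mongodb do
    set :host, 'localhost'
    set :port, 27017
    set :db_name, 'news_crawler'
  end
  set :prefix, 'dev'            # collections are named "#{prefix}_#{suffix}"
  group :suffix do
    set :raw_data, 'raw_data'   # => collection "dev_raw_data"
  end
  group :db do
    set :engine, 'mongo'        # matches MongoStorage::NAME
  end
end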