news_crawler 0.0.0.pre.1
- checksums.yaml +7 -0
- data/bin/news_crawler +94 -0
- data/lib/news_crawler/autostart.rb +33 -0
- data/lib/news_crawler/config.rb +53 -0
- data/lib/news_crawler/crawler_module.rb +70 -0
- data/lib/news_crawler/default_config.yml +13 -0
- data/lib/news_crawler/default_sds.yml +1 -0
- data/lib/news_crawler/downloader.rb +112 -0
- data/lib/news_crawler/link_selector/same_domain_selector.rb +172 -0
- data/lib/news_crawler/nc_logger.rb +49 -0
- data/lib/news_crawler/storage/raw_data/mongo_storage.rb +77 -0
- data/lib/news_crawler/storage/raw_data/raw_data_engine.rb +67 -0
- data/lib/news_crawler/storage/raw_data.rb +74 -0
- data/lib/news_crawler/storage/url_queue/mongo_storage.rb +218 -0
- data/lib/news_crawler/storage/url_queue/url_queue_engine.rb +124 -0
- data/lib/news_crawler/storage/url_queue/url_queue_error.rb +28 -0
- data/lib/news_crawler/storage/url_queue.rb +150 -0
- data/lib/news_crawler/url_helper.rb +50 -0
- data/lib/news_crawler/utils/robots_patch.rb +34 -0
- data/lib/news_crawler.rb +47 -0
- metadata +203 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 3cdcc66767575d17c8a1f5d25f00713f74d51e98
+  data.tar.gz: 7c7d2c838066a45c4365a0756625e4a08c5d4482
+SHA512:
+  metadata.gz: 75f259b0cafbff494302955a1b7968d331d69298c8e5a31185b0b8288408f7af323721eb83a2043005e2acb7e12337051622e191356da497892eb35701dfe7d8
+  data.tar.gz: 4382b0ed8d4bc28134d8af322f9a2f46b95a8732b2629835ae0b49da4e3ecdaa6fcce671c7fd5adc8cc198fb8e107360678277767b9cc9d706bae7a6530b2dd4
data/bin/news_crawler
ADDED
@@ -0,0 +1,94 @@
+#! /usr/bin/env ruby
+# -*- coding: utf-8 -*-
+
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'optparse'
+require 'news_crawler/config'
+require 'news_crawler/nc_logger'
+
+require 'news_crawler/downloader'
+require 'news_crawler/link_selector/same_domain_selector'
+
+include NewsCrawler::Storage
+
+options = {}
+
+OptionParser.new do | opts |
+  opts.banner = "Usage: news_crawler [options] url"
+
+  opts.on('-c', "--app-conf FILE", "Application configuration file") do | f |
+    options[:app_conf] = File.expand_path(f)
+    raise Errno::ENOENT unless File.exist? options[:app_conf]
+  end
+
+  # '-s' instead of the original '-sds', which collides with OptionParser's
+  # single-character short-option convention
+  opts.on('-s', "--sds-conf FILE", "Same domain selector configuration file") do | f |
+    options[:sds_conf] = File.expand_path(f)
+    raise Errno::ENOENT unless File.exist? options[:sds_conf]
+  end
+
+  # '-C' instead of the original '-c', which is already taken by --app-conf
+  opts.on('-C', '--[no-]cleardb', "Clear database") do | cd |
+    options[:cleardb] = cd
+  end
+
+  opts.on('-d', "--max-depth DEPTH", OptionParser::DecimalInteger,
+          'Maximum depth of url to crawl') do | d |
+    options[:max_depth] = d
+  end
+end.parse!
+
+NewsCrawler::CrawlerConfig.load_application_config(options[:app_conf]) unless options[:app_conf].nil?
+NewsCrawler::CrawlerConfig.load_samedomainselector_config(options[:sds_conf]) unless options[:sds_conf].nil?
+
+config = SimpleConfig.for :application
+NewsCrawler::Storage::RawData.set_engine(config.db.engine.intern)
+NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
+
+if (options[:cleardb])
+  URLQueue.clear
+  RawData.clear
+end
+
+if ARGV.size > 0
+  url = ARGV[0]
+  URLQueue.add(url)
+end
+
+puts "Starting Downloader"
+dwl = NewsCrawler::Downloader.new(false)
+dwl.async.run
+
+puts "Starting SDS"
+se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
+se.async.run
+puts "Stopping SDS"
+se.graceful_terminate
+se.terminate
+puts "SDS stopped"
+
+sleep(5)
+
+puts "Stopping Downloader"
+dwl.graceful_terminate
+dwl.terminate
+puts "Downloader stopped"
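
Taken together, the script seeds the queue with the positional url argument, starts the downloader and link selector as Celluloid actors, then shuts both down gracefully. A typical invocation might look like this (config path and url are hypothetical):

    news_crawler --app-conf ./app_conf.yml --max-depth 2 http://example.com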
data/lib/news_crawler/autostart.rb
ADDED
@@ -0,0 +1,33 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simple_config'
+require 'mongo'
+
+require 'news_crawler/config'
+require 'news_crawler/storage/url_queue'
+
+SimpleConfig.for(:application) do
+  set :prefix, 'test'
+end
+
+config = SimpleConfig.for :application
+NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
data/lib/news_crawler/config.rb
ADDED
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simple_config'
+
+module NewsCrawler
+  class CrawlerConfig
+    DEFAULT_CONFIG = File.join(File.dirname(__FILE__),
+                               './default_config.yml')
+    DEFAULT_SDS_CONFIG = File.join(File.dirname(__FILE__),
+                                   './default_sds.yml')
+
+    def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
+      if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
+        @app_loaded = true
+        SimpleConfig.for :application do
+          load file
+        end
+      end
+    end
+
+    def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
+      if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
+        @sds_loaded = true
+        SimpleConfig.for :same_domain_selector do
+          load file
+        end
+      end
+    end
+  end
+end
+
+NewsCrawler::CrawlerConfig.load_application_config
+NewsCrawler::CrawlerConfig.load_samedomainselector_config
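
For illustration, here is a hedged sketch of how these loaders are consumed elsewhere in the gem. The db.engine and mongodb keys are assumed from the code in this diff (default_config.yml itself, +13 lines, is not shown), and the path is invented:

    require 'news_crawler/config'

    # Load a custom application config (hypothetical path); with no
    # argument the bundled default_config.yml is used instead.
    NewsCrawler::CrawlerConfig.load_application_config('/etc/news_crawler/app.yml')

    config = SimpleConfig.for :application
    config.db.engine     # e.g. 'mongo' -- selects the storage backend
    config.mongodb.host  # connection settings read by the Mongo storage engines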
data/lib/news_crawler/crawler_module.rb
ADDED
@@ -0,0 +1,70 @@
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'news_crawler/storage/url_queue'
+require 'thread'
+
+module NewsCrawler
+  # Include this to get basic module methods
+  module CrawlerModule
+    # Mark the given url as processed by the current module
+    # @param [ String ] url
+    def mark_processed(url)
+      URLQueue.mark(self.class.name, url, URLQueue::PROCESSED)
+    end
+
+    # Mark the given url as unprocessed by the current module
+    # @param [ String ] url
+    def mark_unprocessed(url)
+      URLQueue.mark(self.class.name, url, URLQueue::UNPROCESSED)
+    end
+
+    # Find all visited but still unprocessed urls
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ Array ] URL list
+    def find_unprocessed(max_depth = -1)
+      URLQueue.find_all(self.class.name, URLQueue::UNPROCESSED, max_depth)
+    end
+
+    # Find all visited urls with the given process state for the current module
+    # @param [ String ] state one of unprocessed, processing, processed
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ Array ] URL list
+    def find_all(state, max_depth = -1)
+      URLQueue.find_all(self.class.name, state, max_depth)
+    end
+
+    # Find one visited url with the given process state for the current module
+    # @param [ String ] state
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ String, nil ] URL or nil if no such url exists
+    def find_one(state, max_depth = -1)
+      URLQueue.find_one(self.class.name, state, max_depth)
+    end
+
+    # Atomically get the next unprocessed url and mark it as processing
+    # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
+    # @return [ String, nil ] URL or nil if no such url exists
+    def next_unprocessed(max_depth = -1)
+      URLQueue.next_unprocessed(self.class.name, max_depth)
+    end
+  end
+end
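
As a usage illustration, a minimal sketch of a hypothetical processing module built on CrawlerModule (the class name and body are invented for this example, not part of the gem):

    require 'news_crawler/crawler_module'
    require 'news_crawler/storage/raw_data'

    module NewsCrawler
      # Hypothetical module: walks the queue and reports page sizes.
      class PageSizeCounter
        include NewsCrawler::CrawlerModule

        def run
          # next_unprocessed atomically claims a url for this module,
          # so several modules can share one queue without stepping
          # on each other's per-module state.
          while (url = next_unprocessed)
            body = Storage::RawData.find_by_url(url)
            puts "#{url}: #{body.to_s.bytesize} bytes"
            mark_processed(url)
          end
        end
      end
    end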
data/lib/news_crawler/default_sds.yml
ADDED
@@ -0,0 +1 @@
+:exclude:
data/lib/news_crawler/downloader.rb
ADDED
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'celluloid'
+require 'typhoeus'
+require 'simple_config'
+
+require 'news_crawler/config'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/utils/robots_patch'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  # This class implements a parallel downloader based on Typhoeus
+  # with a given queue
+  class Downloader
+    include Celluloid
+
+    CONCURRENT_DOWNLOAD = 4
+
+    # Construct downloader with an URLQueue
+    # @param [ Boolean ] start_on_create whether to start the downloader immediately
+    # @param [ NewsCrawler::URLQueue ] queue url queue
+    def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
+      @queue = queue
+      @urls = queue.find_unvisited
+      @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
+      @wait_time = 1
+      @status = :running
+      @stopping = false
+      wait_for_url if start_on_create
+    end
+
+    # Start downloader with current queue
+    # Successfully fetched URLs are marked and the results stored in the DB
+    def run
+      @status = :running
+      hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
+      # TODO Log here
+      @urls = @urls.keep_if do | url |
+        Robots.instance.allowed? url
+      end
+      requests = @urls.map do | url |
+        re = Typhoeus::Request.new(url, followlocation: true)
+        re.on_complete do | response |
+          if response.success?
+            Storage::RawData.add(url, response.response_body)
+            @queue.mark_visited url
+          else
+            NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
+          end
+        end
+        hydra.queue re
+        re
+      end
+      hydra.run
+      @urls = []
+      wait_for_url
+    end
+
+    # Gracefully terminate this downloader
+    def graceful_terminate
+      @stopping = true
+      while @status == :running
+        sleep(1)
+      end
+    end
+
+    private
+    # Wait for new urls to be added to the queue, using an exponential backoff
+    def wait_for_url
+      @status = :waiting
+      if @stopping # check for stop flag
+        return
+      end
+      sleep @wait_time
+      get_new_url
+      if @urls.size == 0
+        if @wait_time < 30
+          @wait_time = @wait_time * 2
+        end
+        wait_for_url
+      else
+        @wait_time = 1
+        run
+      end
+    end
+
+    def get_new_url
+      @urls = @queue.find_unvisited
+    end
+  end
+end
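
Because Downloader includes Celluloid, each instance is an actor. A hedged sketch of driving one by hand, with engine setup as in the bundled executable and an invented seed url:

    require 'news_crawler/downloader'
    require 'news_crawler/storage/url_queue'

    NewsCrawler::Storage::URLQueue.set_engine(:mongo)
    NewsCrawler::Storage::URLQueue.add('http://example.com')  # hypothetical seed

    dwl = NewsCrawler::Downloader.new(false)  # false: don't start fetching yet
    dwl.async.run                             # run on the actor's own thread
    # ... let it crawl for a while ...
    dwl.graceful_terminate                    # block until no fetch is in flight
    dwl.terminate                             # shut down the Celluloid actor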
data/lib/news_crawler/link_selector/same_domain_selector.rb
ADDED
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'celluloid'
+require 'nokogiri'
+
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/url_helper'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  module LinkSelector
+    # Selects all links pointing to the same domain.
+    # The domain is taken from the database.
+    class SameDomainSelector
+      include NewsCrawler::URLHelper
+      extend NewsCrawler::URLHelper
+
+      include NewsCrawler::CrawlerModule
+      include Celluloid
+
+      # Create a new selector with a queue
+      # Selected URLs are put back into the queue
+      # @param [ Fixnum ] max_depth maximum depth to crawl
+      # @param [ Boolean ] start_on_create whether to start the selector immediately
+      def initialize(max_depth = -1, start_on_create = true)
+        @max_depth = max_depth
+        @wait_time = 1
+        @status = :running
+        @stopping = false
+        run if start_on_create
+      end
+
+      # Extract urls from a fetched page
+      def extract_url(url)
+        doc = RawData.find_by_url(url)
+        html_doc = Nokogiri::HTML(doc)
+        results = []
+
+        inner_url = html_doc.xpath('//a').collect { | a_el |
+          temp_url = (a_el.attribute 'href').to_s
+          if (!temp_url.nil?) && (temp_url[0] == '/')
+            temp_url = url + temp_url
+          end
+          temp_url
+        }
+
+        inner_url.delete_if { | i_url |
+          (i_url.nil?) || (i_url.size == 0) || (i_url == '#')
+        }
+
+        # select urls from the same domain
+        inner_url.select { | o_url |
+          if (same_domain?(o_url, url))
+            if (!SameDomainSelector.exclude?(o_url))
+              begin
+                URLQueue.add(o_url, url)
+                results << [o_url, url]
+              rescue URLQueue::DuplicateURLError
+              end
+            else
+              # TODO Log here
+            end
+          end
+        }
+      end
+
+      def run
+        @status = :running
+        return if @stopping
+        if @max_depth == 0
+          @status = :stopped
+          return
+        end
+        while !@stopping
+          url = next_unprocessed(@max_depth - 1)
+          while (url.nil?)
+            wait_for_url
+            url = next_unprocessed(@max_depth - 1)
+          end
+          NCLogger.get_logger.info "Processing #{url}"
+          extract_url(url)
+          mark_processed(url)
+        end
+      end
+
+      # Test whether a url is excluded
+      # @param [ String ] url
+      # @return [ Boolean ] true if url is excluded, false otherwise
+      def self.exclude?(url)
+        config = SimpleConfig.for :same_domain_selector
+        exclude_list = []
+        url_domain = get_url_path(url)[:domain]
+        begin
+          exclude_group = config.exclude
+        rescue NoMethodError
+          return false
+        end
+
+        exclude_group.to_hash.keys.each do | url_e |
+          if url_domain.to_s.end_with? url_e.to_s
+            exclude_list = config.exclude.get(url_e)
+            break
+          end
+        end
+
+        exclude_list = exclude_list.map do | elt |
+          if /^\/.*\/$/ =~ elt
+            Regexp.new(elt[1..-2]) # slash-delimited: already a regexp
+          else
+            Regexp.new("^(.*/)?#{elt}(/.*)?$") # bare word: match a whole path segment
+          end
+        end
+
+        if exclude_list.count == 0
+          return false
+        end
+
+        exclude_list.each do | exclude_rule |
+          if exclude_rule =~ url
+            return true
+          end
+        end
+        return false
+      end
+
+      # Gracefully terminate this selector
+      def graceful_terminate
+        @stopping = true
+        while @status == :running
+          sleep(1)
+        end
+      end
+
+      private
+      # Wait for new urls to be added to the queue, using an exponential backoff
+      def wait_for_url
+        @status = :waiting
+        sleep @wait_time
+        if @wait_time < 30
+          @wait_time = @wait_time * 2 # fixed: original assigned to @wait_times
+        end
+      end
+    end
+  end
+end
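
To make the exclude? parsing concrete, a hedged sketch of an sds_conf.yml passed via --sds-conf. The nesting is inferred from how exclude? walks config.exclude (domain keys mapping to rule lists); the domain and rules are invented:

    :exclude:
      example.com:          # applies to urls whose domain ends with 'example.com'
        - tag               # bare entry: compiled to ^(.*/)?tag(/.*)?$
        - "/\\?page=\\d+/"  # slash-delimited entry: compiled as a raw regexp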
data/lib/news_crawler/nc_logger.rb
ADDED
@@ -0,0 +1,49 @@
+#! /usr/bin/env ruby
+# -*- coding: utf-8 -*-
+
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'logger'
+
+module NewsCrawler
+  class NCLogger
+    # Get logger
+    def self.get_logger
+      @logger ||= Logger.new(STDERR)
+      @logger.progname = 'news_crawler'
+      @logger
+    end
+
+    # Set logger level
+    # @param [ Logger::Severity ] l level
+    def self.set_level(l)
+      get_logger.level = l
+    end
+
+    # Set the log device; should support the same API as Ruby's Logger
+    # @param [ Object ] ld log device
+    def self.set_logdev(ld)
+      @logger = Logger.new(ld)
+      @logger.progname = 'news_crawler'
+    end
+  end
+end
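
A brief usage sketch (the log file path is hypothetical; Ruby's Logger.new accepts a filename or any IO-like device):

    require 'news_crawler/nc_logger'

    NewsCrawler::NCLogger.set_level(Logger::WARN)    # silence info chatter
    NewsCrawler::NCLogger.set_logdev('crawler.log')  # redirect from STDERR to a file
    NewsCrawler::NCLogger.get_logger.warn('fetch error rate is climbing')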
data/lib/news_crawler/storage/raw_data/mongo_storage.rb
ADDED
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'mongo'
+require 'simple_config'
+require 'news_crawler/storage/raw_data/raw_data_engine'
+
+module NewsCrawler
+  module Storage
+    module RawData
+      # Raw data storage implemented using MongoDB
+      class MongoStorage < NewsCrawler::Storage::RawData::RawDataEngine
+        NAME = 'mongo'
+
+        include Mongo
+
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          @coll = db[config.prefix + '_' + config.suffix.raw_data]
+          @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+        end
+
+        # Add an entry to the raw data collection, overwriting old data
+        # @param [ String ] url
+        # @param [ String ] body
+        def add(url, body)
+          @coll.update({:url => url},
+                       {:$set => {:body => body}},
+                       {:upsert => true})
+        end
+
+        # Find the document for the corresponding url
+        # @param [ String ] url
+        # @return [ String, nil ]
+        def find_by_url(url)
+          result = @coll.find_one({:url => url})
+          if (!result.nil?)
+            result['body']
+          else
+            nil
+          end
+        end
+
+        # Get the number of raw data entries
+        def count
+          @coll.count
+        end
+
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end
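
Design note: add is an upsert keyed on the unique url index, so re-fetching a page replaces its stored body instead of raising a duplicate-key error. A hedged sketch of exercising the engine directly (assumes a local MongoDB and the application config keys used in initialize; the url and bodies are invented):

    require 'news_crawler/storage/raw_data/mongo_storage'

    store = NewsCrawler::Storage::RawData::MongoStorage.new
    store.add('http://example.com/', '<html>v1</html>')
    store.add('http://example.com/', '<html>v2</html>')  # upsert: overwrites v1
    store.find_by_url('http://example.com/')             # => "<html>v2</html>"
    store.count                                          # => 1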