news_crawler 0.0.0.pre.1
- checksums.yaml +7 -0
- data/bin/news_crawler +94 -0
- data/lib/news_crawler/autostart.rb +33 -0
- data/lib/news_crawler/config.rb +53 -0
- data/lib/news_crawler/crawler_module.rb +70 -0
- data/lib/news_crawler/default_config.yml +13 -0
- data/lib/news_crawler/default_sds.yml +1 -0
- data/lib/news_crawler/downloader.rb +112 -0
- data/lib/news_crawler/link_selector/same_domain_selector.rb +172 -0
- data/lib/news_crawler/nc_logger.rb +49 -0
- data/lib/news_crawler/storage/raw_data/mongo_storage.rb +77 -0
- data/lib/news_crawler/storage/raw_data/raw_data_engine.rb +67 -0
- data/lib/news_crawler/storage/raw_data.rb +74 -0
- data/lib/news_crawler/storage/url_queue/mongo_storage.rb +218 -0
- data/lib/news_crawler/storage/url_queue/url_queue_engine.rb +124 -0
- data/lib/news_crawler/storage/url_queue/url_queue_error.rb +28 -0
- data/lib/news_crawler/storage/url_queue.rb +150 -0
- data/lib/news_crawler/url_helper.rb +50 -0
- data/lib/news_crawler/utils/robots_patch.rb +34 -0
- data/lib/news_crawler.rb +47 -0
- metadata +203 -0
data/lib/news_crawler/storage/raw_data/raw_data_engine.rb
@@ -0,0 +1,67 @@
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

module NewsCrawler
  module Storage
    module RawData
      # Base class for RawData engines.
      # Subclass it and implement all of its methods to create a new
      # RawData engine; keep the methods' signatures unchanged.
      class RawDataEngine
        # Every subclass is automatically registered as an engine.
        def self.inherited(klass)
          @engine_list = (@engine_list || []) + [klass]
        end

        # Get the registered engines
        # @return [ Hash ] map from engine name (Symbol) to engine class
        def self.get_engines
          @engine_list = @engine_list || []
          @engine_list.inject({}) do | memo, klass |
            memo[klass::NAME.intern] = klass
            memo
          end
        end

        # Add an entry to the raw data collection
        # @param [ String ] url
        # @param [ String ] body
        def add(url, body)
          raise NotImplementedError
        end

        # Find the document with the corresponding url
        # @param [ String ] url
        # @return [ String, nil ]
        def find_by_url(url)
          raise NotImplementedError
        end

        def count
          raise NotImplementedError
        end

        def clear
          raise NotImplementedError
        end
      end
    end
  end
end
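The `inherited` hook above means a storage backend registers itself simply by being defined. A minimal sketch of a custom engine (the `MemoryEngine` class and its `NAME` are hypothetical, for illustration only):

require 'news_crawler/storage/raw_data/raw_data_engine'

module NewsCrawler
  module Storage
    module RawData
      # Hypothetical in-memory engine; defining it is enough to make it
      # appear in RawDataEngine.get_engines under the key :memory.
      class MemoryEngine < RawDataEngine
        NAME = 'memory'

        def initialize(*opts)
          @docs = {}
        end

        def add(url, body)
          @docs[url] = body
        end

        def find_by_url(url)
          @docs[url]
        end

        def count
          @docs.size
        end

        def clear
          @docs.clear
        end
      end
    end
  end
end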
data/lib/news_crawler/storage/raw_data.rb
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'simple_config'

require 'news_crawler/storage/raw_data/mongo_storage'
require 'news_crawler/storage/raw_data/raw_data_engine'

module NewsCrawler
  module Storage
    # Stores raw data fetched from websites
    module RawData
      class << self
        # Set the RawData storage engine
        # @param [ Symbol, Object ] engine database engine; this can be
        #   * `:mongo`, `:mongodb` for the MongoDB backend
        #   Pass an object for a custom engine.
        # @param [ Hash ] opts options passed to the engine
        def set_engine(engine, *opts)
          if engine.respond_to? :intern
            engine = engine.intern
          end
          engine_class = RawDataEngine.get_engines[engine]
          if engine_class
            @engine = engine_class.new(*opts)
          else
            @engine = engine
          end
        end

        # Add an entry to the raw data collection
        # @param [ String ] url
        # @param [ String ] body
        def add(url, body)
          @engine.add(url, body)
        end

        # Find the document with the corresponding url
        # @param [ String ] url
        # @return [ String, nil ]
        def find_by_url(url)
          @engine.find_by_url url
        end

        def count
          @engine.count
        end

        def clear
          @engine.clear
        end
      end
    end
  end
end
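The facade delegates everything to whichever engine was selected. A hedged usage sketch (the URL and page body are made up, and `:mongo` assumes the bundled raw-data Mongo engine registers under that name, as the URLQueue engine in the next file does):

require 'news_crawler/storage/raw_data'

store = NewsCrawler::Storage::RawData
store.set_engine(:mongo)                   # or pass a custom engine object
store.add('http://example.com/', '<html>...</html>')
store.find_by_url('http://example.com/')   # => '<html>...</html>'
store.count                                # => 1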
data/lib/news_crawler/storage/url_queue/mongo_storage.rb
@@ -0,0 +1,218 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'mongo'
require 'news_crawler/storage/url_queue/url_queue_error'
require 'news_crawler/storage/url_queue/url_queue_engine'

module NewsCrawler
  module Storage
    module URLQueue
      # URL queue storage engine with a MongoDB backend
      class MongoEngine < NewsCrawler::Storage::URLQueue::URLQueueEngine
        NAME = 'mongo'

        include Mongo

        # Construct a queue
        def initialize(*opts)
          config = SimpleConfig.for :application
          db = MongoClient.new(config.mongodb.host, config.mongodb.port,
                               pool_size: 4,
                               pool_timeout: 5)[config.mongodb.db_name]
          coll_name = config.prefix + '_' + config.suffix.url_queue
          h_opts = ((opts[-1].is_a? Hash) ? opts[-1] : {})
          @coll = db[h_opts[:coll_name] || coll_name]
          @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
        end

        # Add a URL to the queue, with an optional referring URL
        # @param [ String ] url
        # @param [ String ] ref_url
        def add(url, ref_url = '')
          if (ref_url == '')
            depth = 0
          else
            depth = (get_url_depth(ref_url) || 0) + 1
          end
          begin
            @coll.insert({:url => url,
                          :depth => depth,
                          :visited => false})
          rescue Mongo::OperationFailure => e
            if e.error_code == 11000 # duplicate key error
              raise DuplicateURLError, url
            else
              raise e
            end
          end
        end

        # Mark a URL as visited
        # @param [ String ] url
        def mark_visited(url)
          @coll.update({:url => url},
                       {:$set => {'visited' => true}})
        end

        # Mark all URLs as unvisited
        def mark_all_unvisited
          @coll.update({},
                       {:$set => {'visited' => false}},
                       {:multi => true})
        end

        # # Mark a URL as processed
        # # @param [ String ] url
        # def mark_processed(url, **opts)
        #   @coll.update({:url => url},
        #                {:$set => {:processed => true}})
        # end

        # Set the processing state of a url for the given module
        # @param [ String ] module_name
        # @param [ String ] url
        # @param [ String ] state one of unprocessed, processing, processed
        def mark(module_name, url, state)
          @coll.update({:url => url},
                       {:$set => {module_name => state}})
        end

        # Change all urls in one state to another state
        # @param [ String ] module_name
        # @param [ String ] new_state new state
        # @param [ String ] orig_state original state
        def mark_all(module_name, new_state, orig_state = nil)
          selector = (orig_state.nil? ? {} : {module_name => orig_state})
          @coll.update(selector,
                       {:$set => {module_name => new_state}},
                       :multi => true)
        end

        # Get every URL together with its status
        # @return [ Array ] array of hashes containing url and status
        def all(*opts)
          @coll.find.collect do | entry |
            entry.each_key.inject({}) do | memo, key |
              if key != '_id'
                memo[key.intern] = entry[key]
              end
              memo
            end
          end
        end

        # TODO fix bug - find *visited* url
        # Find all visited urls in the given module processing state
        # @param [ String ] modul_name
        # @param [ String ] state one of unprocessed, processing, processed
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ Array ] URL list
        def find_all(modul_name, state, max_depth = -1)
          if (state == URLQueue::UNPROCESSED)
            selector = {:$or => [{modul_name => state},
                                 {modul_name => {:$exists => false}}]}
          else
            selector = {modul_name => state}
          end
          selector = {:$and => [selector,
                                {'visited' => true}]}
          if max_depth > -1
            selector[:$and] << {'depth' => {:$lte => max_depth}}
          end
          @coll.find(selector).collect do | entry |
            entry['url']
          end
        end

        # Find one visited url in the given module processing state
        # @param [ String ] modul_name
        # @param [ String ] state one of unprocessed, processing, processed
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ String, nil ] URL, or nil if no url matches the criteria
        def find_one(modul_name, state, max_depth = -1)
          a = find_all(modul_name, state, max_depth)
          if a.size > 0
            a[0]
          else
            nil
          end
        end

        # Atomically get the next unprocessed url and mark it as processing
        # @param [ String ] modul_name
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ String, nil ] URL, or nil if no such url exists
        def next_unprocessed(modul_name, max_depth = -1)
          selector = {:$or => [{modul_name => URLQueue::UNPROCESSED},
                               {modul_name => {:$exists => false}}]}
          selector = {:$and => [selector,
                                {'visited' => true}]}
          if max_depth > -1
            selector[:$and] << {'depth' => {:$lte => max_depth}}
          end
          doc = @coll.find_and_modify(:query => selector,
                                      :update => {:$set =>
                                          {modul_name => URLQueue::PROCESSING}})
          (doc.nil? ? nil : doc['url'])
        end
        alias :find_and_mark :next_unprocessed

        # Get the list of unvisited URLs
        # @param [ Fixnum ] max_depth maximum depth of urls returned (inclusive)
        # @return [ Array ] unvisited urls, optionally limited by max_depth
        def find_unvisited(max_depth = -1)
          if max_depth > -1
            selector = {:$and => [{'visited' => false},
                                  {'depth' => {:$lte => max_depth}}]}
          else
            selector = {'visited' => false}
          end
          @coll.find(selector).collect do | entry |
            entry['url']
          end
        end

        # Clear the URL queue
        # @return [ Fixnum ] number of urls removed
        def clear(*opts)
          count = @coll.count
          @coll.remove
          count
        end

        # Get the depth of the given url
        # @param [ String ] url
        # @return [ Fixnum, nil ] URL depth, or nil if the url is not queued
        def get_url_depth(url)
          doc = @coll.find_one({'url' => url}, {:fields => ['depth']})
          doc.nil? ? nil : doc['depth']
        end
      end
    end
  end
end
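MongoEngine#initialize pulls its connection settings from SimpleConfig rather than from its arguments. A sketch of an application config providing every key the constructor reads (all values here are assumed, not shipped defaults):

require 'simple_config'

SimpleConfig.for :application do
  set :prefix, 'nc'               # collection name prefix

  group :mongodb do
    set :host, 'localhost'
    set :port, 27017
    set :db_name, 'news_crawler'
  end

  group :suffix do
    set :url_queue, 'url_queue'   # queue collection suffix
  end
end

# With this config the queue collection is named "nc_url_queue";
# passing :coll_name to MongoEngine.new overrides it.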
data/lib/news_crawler/storage/url_queue/url_queue_engine.rb
@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

module NewsCrawler
  module Storage
    module URLQueue
      # Base class for URLQueue engines.
      # Subclass it and implement all of its methods to create a new
      # URLQueue engine; keep the methods' signatures unchanged.
      class URLQueueEngine
        # Every subclass is automatically registered as an engine.
        def self.inherited(klass)
          @engine_list = (@engine_list || []) + [klass]
        end

        # Get the registered engines
        # @return [ Hash ] map from engine name (Symbol) to engine class
        def self.get_engines
          @engine_list = @engine_list || []
          @engine_list.inject({}) do | memo, klass |
            memo[klass::NAME.intern] = klass
            memo
          end
        end

        # Set the processing state of a url for the given module
        # @param [ String ] module_name
        # @param [ String ] url
        # @param [ String ] state one of unprocessed, processing, processed
        def mark(module_name, url, state)
          raise NotImplementedError
        end

        # Change all urls in one state to another state
        # @param [ String ] module_name
        # @param [ String ] new_state new state
        # @param [ String ] orig_state original state
        def mark_all(module_name, new_state, orig_state = nil)
          raise NotImplementedError
        end

        # Produce the next unprocessed url and mark it as processing
        # @param [ String ] module_name
        # @return [ String, nil ]
        def next_unprocessed(module_name)
          raise NotImplementedError
        end

        # Find all visited urls in the given module processing state
        # @param [ String ] module_name
        # @param [ String ] state
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ Array ] URL list
        def find_all(module_name, state, max_depth = -1)
          raise NotImplementedError
        end

        # Find one visited url in the given module processing state
        # @param [ String ] module_name
        # @param [ String ] state one of unprocessed, processing, processed
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ String, nil ] URL
        def find_one(module_name, state, max_depth = -1)
          raise NotImplementedError
        end

        # Get the list of unvisited URLs
        # @param [ Fixnum ] max_depth maximum depth of urls returned
        # @return [ Array ] unvisited urls, optionally limited by max_depth
        def find_unvisited(max_depth = -1)
          raise NotImplementedError
        end

        # Add a url, with an optional reference url
        # @param [ String ] url URL
        # @param [ String ] ref_url reference URL
        def add(url, ref_url = '')
          raise NotImplementedError
        end

        # Clear the URLQueue
        # @return [ Fixnum ] number of urls removed
        def clear
          raise NotImplementedError
        end

        # Mark a URL as visited
        # @param [ String ] url
        def mark_visited(url)
          raise NotImplementedError
        end

        # Mark all URLs as unvisited
        def mark_all_unvisited
          raise NotImplementedError
        end

        # Get every url with its status
        # @return [ Array ] URL list
        def all
          raise NotImplementedError
        end
      end
    end
  end
end
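Since MongoEngine above subclasses URLQueueEngine, the `inherited` hook has already registered it. A quick sketch of inspecting the registry:

require 'news_crawler/storage/url_queue/mongo_storage'

engines = NewsCrawler::Storage::URLQueue::URLQueueEngine.get_engines
engines.keys      # => [:mongo]
engines[:mongo]   # => NewsCrawler::Storage::URLQueue::MongoEngine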
data/lib/news_crawler/storage/url_queue/url_queue_error.rb
@@ -0,0 +1,28 @@
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

module NewsCrawler
  module Storage
    module URLQueue
      class DuplicateURLError < StandardError; end
    end
  end
end
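DuplicateURLError is what MongoEngine#add raises when the unique index on :url rejects an insert, so callers can treat re-discovered links as non-fatal. A sketch of tolerating them around the queue facade's `add` (next file; the URLs are made up):

begin
  NewsCrawler::Storage::URLQueue.add('http://example.com/article',
                                     'http://example.com/')
rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
  # The url was already queued; safe to ignore during a crawl.
end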
data/lib/news_crawler/storage/url_queue.rb
@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
#--
# NewsCrawler - a website crawler
#
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
#
# This file is part of NewsCrawler.
#
# NewsCrawler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# NewsCrawler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
#++

require 'simple_config'
require 'news_crawler/storage/url_queue/mongo_storage'
require 'news_crawler/storage/url_queue/url_queue_engine'

module NewsCrawler
  module Storage
    # Stores and manipulates the url queue
    module URLQueue
      ACTION_LIST = [:mark_visited, :mark_processed, :find_unvisited,
                     :find_unprocessed, :find_unprocessed_with_depth]
      PROCESSED   = 'processed'
      PROCESSING  = 'processing'
      UNPROCESSED = 'unprocessed'

      class << self
        # Set the URLQueue storage engine
        # @param [ Symbol, Object ] engine database engine; this can be
        #   * `:mongo`, `:mongodb` for the MongoDB backend
        #   Pass an object for a custom engine.
        # @param [ Hash ] opts options passed to the engine
        def set_engine(engine, *opts)
          if engine.respond_to? :intern
            engine = engine.intern
          end
          engine_class = URLQueueEngine.get_engines[engine]
          if engine_class
            @engine = engine_class.new(*opts)
          else
            @engine = engine
          end
        end

        # Mark a URL as visited
        # @param [ String ] url
        def mark_visited(url)
          url = normalize_url url
          @engine.mark_visited(url)
        end

        # Mark all URLs as unvisited
        def mark_all_unvisited
          @engine.mark_all_unvisited
        end

        # Set the processing state of a url for the given module
        # @param [ String ] module_name
        # @param [ String ] url
        # @param [ String ] state one of unprocessed, processing, processed
        def mark(module_name, url, state)
          url = normalize_url url
          @engine.mark(module_name, url, state)
        end

        # Change all urls in one state to another state
        # @param [ String ] module_name
        # @param [ String ] new_state new state
        # @param [ String ] orig_state original state
        def mark_all(module_name, new_state, orig_state = nil)
          @engine.mark_all(module_name, new_state, orig_state)
        end

        # Find all visited urls in the given module processing state
        # @param [ String ] module_name
        # @param [ String ] state
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ Array ] URL list
        def find_all(module_name, state, max_depth = -1)
          @engine.find_all(module_name, state, max_depth)
        end

        # Find one visited url in the given module processing state
        # @param [ String ] module_name
        # @param [ String ] state one of unprocessed, processing, processed
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ String, nil ] URL
        def find_one(module_name, state, max_depth = -1)
          @engine.find_one(module_name, state, max_depth)
        end

        # Atomically get the next unprocessed url and mark it as processing
        # @param [ String ] module_name
        # @param [ Fixnum ] max_depth maximum url depth returned (inclusive)
        # @return [ String, nil ] URL, or nil if no such url exists
        def next_unprocessed(module_name, max_depth = -1)
          @engine.next_unprocessed(module_name, max_depth)
        end

        # Get the list of unvisited URLs
        # @param [ Fixnum ] max_depth maximum depth of urls returned
        # @return [ Array ] unvisited urls, optionally limited by max_depth
        def find_unvisited(max_depth = -1)
          @engine.find_unvisited(max_depth)
        end

        # Add a URL to the queue
        # @param [ String ] url
        # @param [ String ] ref_url reference url
        def add(url, ref_url = '')
          url = normalize_url url
          if ref_url != ''
            ref_url = normalize_url ref_url
          end
          @engine.add(url, ref_url)
        end

        # Clear the URLQueue
        # @return [ Fixnum ] number of urls removed
        def clear
          @engine.clear
        end

        # Get every url with its status
        # @return [ Array ] URL list
        def all
          @engine.all
        end

        # Prepend "http://" when the url has no scheme
        def normalize_url(url)
          if (!url.start_with? "http")
            "http://" + url
          else
            url
          end
        end
      end
    end
  end
end
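Putting the queue facade together, a processing module would typically drain the queue with `next_unprocessed` and record its progress with `mark`. A hedged end-to-end sketch (`'my_module'` and the URL are hypothetical; note that `next_unprocessed` only yields urls already marked visited):

require 'news_crawler/storage/url_queue'

queue = NewsCrawler::Storage::URLQueue
queue.set_engine(:mongo)

queue.add('example.com')                  # normalize_url prepends "http://"
queue.mark_visited('http://example.com')  # e.g. after the downloader fetched it

while (url = queue.next_unprocessed('my_module', 2)) # depth <= 2, inclusive
  # ... process the page stored for url ...
  queue.mark('my_module', url, NewsCrawler::Storage::URLQueue::PROCESSED)
end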