news_crawler 0.0.0.pre.1

@@ -0,0 +1,67 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module RawData
+       # Base class for RawData engines.
+       # Subclass and implement all of its methods to create a new RawData
+       # engine; keep the method signatures unchanged.
+       class RawDataEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the list of registered engines
+         # @return [ Hash ] map of engine names to engine classes
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           raise NotImplementedError
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           raise NotImplementedError
+         end
+
+         def count
+           raise NotImplementedError
+         end
+
+         def clear
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
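
The `inherited` hook gives `RawDataEngine` a simple plugin registry: every subclass is recorded at class-definition time, and `get_engines` exposes the registry keyed by each subclass's `NAME` constant. As a rough sketch of how a custom engine plugs in (this in-memory `MemoryEngine` is hypothetical, not shipped with the gem):

    require 'news_crawler/storage/raw_data/raw_data_engine'

    module NewsCrawler
      module Storage
        module RawData
          # Hypothetical engine that keeps raw pages in a Hash in memory
          class MemoryEngine < RawDataEngine
            NAME = 'memory' # get_engines registers the class under :memory

            def initialize(*opts)
              @docs = {}
            end

            def add(url, body)
              @docs[url] = body
            end

            def find_by_url(url)
              @docs[url]
            end

            def count
              @docs.size
            end

            def clear
              @docs.clear
            end
          end
        end
      end
    end

    # NewsCrawler::Storage::RawData::RawDataEngine.get_engines
    # now returns a hash including { :memory => MemoryEngine }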
@@ -0,0 +1,74 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+
+ require 'news_crawler/storage/raw_data/mongo_storage'
+ require 'news_crawler/storage/raw_data/raw_data_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store raw data from websites
+     module RawData
+       class << self
+         # Set the RawData storage engine
+         # @param [ Symbol, Object ] engine database engine; pass an object
+         #   to use a custom engine. As a symbol this can be:
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = RawDataEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           @engine.add(url, body)
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           @engine.find_by_url url
+         end
+
+         def count
+           @engine.count
+         end
+
+         def clear
+           @engine.clear
+         end
+       end
+     end
+   end
+ end
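
`set_engine` resolves a symbol through `RawDataEngine.get_engines` and instantiates the matching class, so typical use is one setup call followed by the facade methods, which all delegate to `@engine`. A minimal sketch, assuming the MongoDB settings the bundled engine reads from SimpleConfig are in place (URL and body are placeholders):

    require 'news_crawler/storage/raw_data'

    include NewsCrawler::Storage

    RawData.set_engine(:mongo)  # looked up via RawDataEngine.get_engines
    RawData.add('http://example.com/', '<html>...</html>')
    RawData.find_by_url('http://example.com/')  # => '<html>...</html>'
    RawData.count                               # => 1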
@@ -0,0 +1,218 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'mongo'
+ require 'simple_config'
+ require 'news_crawler/storage/url_queue/url_queue_error'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # URL queue storage engine with MongoDB backend
+       class MongoEngine < NewsCrawler::Storage::URLQueue::URLQueueEngine
+         NAME = 'mongo'
+
+         include Mongo
+
+         # Construct a queue
+         def initialize(*opts)
+           config = SimpleConfig.for :application
+           db = MongoClient.new(config.mongodb.host, config.mongodb.port,
+                                pool_size: 4,
+                                pool_timeout: 5)[config.mongodb.db_name]
+           coll_name = config.prefix + '_' + config.suffix.url_queue
+           h_opts = ((opts[-1].is_a? Hash) ? opts[-1] : {})
+           @coll = db[h_opts[:coll_name] || coll_name]
+           @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+         end
+
+         # Add a URL to the queue, with an optional referring URL
+         # @param [ String ] url
+         # @param [ String ] ref_url
+         def add(url, ref_url = '')
+           if (ref_url == '')
+             depth = 0
+           else
+             depth = (get_url_depth(ref_url) || 0) + 1
+           end
+           begin
+             @coll.insert({:url => url,
+                           :depth => depth,
+                           :visited => false})
+           rescue Mongo::OperationFailure => e
+             if e.error_code == 11000 # duplicate key error
+               raise DuplicateURLError, url
+             else
+               raise e
+             end
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           @coll.update({:url => url},
+                        {:$set => {'visited' => true}})
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @coll.update({},
+                        {:$set => {'visited' => false}},
+                        {:multi => true})
+         end
+
+         # # Mark a URL as processed
+         # # @param [ String ] url
+         # def mark_processed(url, **opts)
+         #   @coll.update({:url => url},
+         #                {:$set => {:processed => true}})
+         # end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           @coll.update({:url => url},
+                        {:$set => {module_name => state}})
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           selector = (orig_state.nil? ? {} : {module_name => orig_state})
+           @coll.update(selector,
+                        {:$set => {module_name => new_state}},
+                        :multi => true)
+         end
+
+         # Get all URLs and their status
+         # @return [ Array ] array of hashes containing url and status
+         def all(*opts)
+           @coll.find.collect do | entry |
+             entry.each_key.inject({}) do | memo, key |
+               if key != '_id'
+                 memo[key.intern] = entry[key]
+               end
+               memo
+             end
+           end
+         end
+
+         # TODO fix bug - find *visited* url
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           if (state == URLQueue::UNPROCESSED)
+             selector = {:$or => [{module_name => state},
+                                  {module_name => {:$exists => false}}]}
+           else
+             selector = {module_name => state}
+           end
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no URL matches the criteria
+         def find_one(module_name, state, max_depth = -1)
+           a = find_all(module_name, state, max_depth)
+           if a.size > 0
+             a[0]
+           else
+             nil
+           end
+         end
+
+         # Atomically get the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(module_name, max_depth = -1)
+           selector = {:$or => [{module_name => URLQueue::UNPROCESSED},
+                                {module_name => {:$exists => false}}]}
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           doc = @coll.find_and_modify(:query => selector,
+                                       :update => {:$set =>
+                                         {module_name => URLQueue::PROCESSING}})
+           (doc.nil? ? nil : doc['url'])
+         end
+         alias :find_and_mark :next_unprocessed
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           if max_depth > -1
+             selector = {:$and => [{'visited' => false},
+                                   {'depth' => {:$lte => max_depth}}]}
+           else
+             selector = {'visited' => false}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear(*opts)
+           count = @coll.count
+           @coll.remove
+           count
+         end
+
+         # Get the depth of the given URL
+         # @param [ String ] url
+         # @return [ Fixnum, nil ] URL depth, or nil if the URL is not queued
+         def get_url_depth(url)
+           doc = @coll.find_one({'url' => url}, {:fields => ['depth']})
+           doc && doc['depth']
+         end
+       end
+     end
+   end
+ end
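
Two details of MongoEngine are worth calling out: the unique index on `:url` is what surfaces re-queued URLs as `DuplicateURLError`, and `find_and_modify` flips the module state to `processing` in the same server round trip, so two workers cannot claim the same URL. A hedged usage sketch, assuming a SimpleConfig `:application` block providing `mongodb.host`, `mongodb.port`, `mongodb.db_name`, `prefix`, and `suffix.url_queue` as `initialize` expects (collection name and URLs are placeholders):

    require 'news_crawler/storage/url_queue/mongo_storage'

    queue = NewsCrawler::Storage::URLQueue::MongoEngine.new(coll_name: 'test_url_queue')

    queue.add('http://example.com/')                          # depth 0
    queue.add('http://example.com/a', 'http://example.com/')  # depth = referrer depth + 1
    queue.mark_visited('http://example.com/')

    # Only visited URLs are candidates; this atomically marks the result
    # as 'processing' for the 'downloader' module.
    queue.next_unprocessed('downloader')  # => 'http://example.com/'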
@@ -0,0 +1,124 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # Base class for URLQueue engines.
+       # Subclass and implement all of its methods to create a new URLQueue
+       # engine; keep the method signatures unchanged.
+       class URLQueueEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the list of registered engines
+         # @return [ Hash ] map of engine names to engine classes
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           raise NotImplementedError
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           raise NotImplementedError
+         end
+
+         # Produce the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ]
+         def next_unprocessed(module_name, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Add a URL with a referring URL
+         # @param [ String ] url URL
+         # @param [ String ] ref_url referring URL
+         def add(url, ref_url = '')
+           raise NotImplementedError
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           raise NotImplementedError
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           raise NotImplementedError
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           raise NotImplementedError
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
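
The abstract engine encodes a small per-module state machine: each crawler module tracks its own state field per URL, so one URL can be processed for the downloader while still unprocessed for an analyser. A sketch of the intended lifecycle against any conforming engine (`engine` is assumed to be an instance of a concrete subclass, the module name is illustrative):

    # unprocessed -> processing happens atomically inside next_unprocessed;
    # processing -> processed is the caller's responsibility via mark.
    while (url = engine.next_unprocessed('analyser'))
      # ... do this module's work on url ...
      engine.mark('analyser', url, NewsCrawler::Storage::URLQueue::PROCESSED)
    end
    # URLs left 'processing' by a crashed run can be reset with
    # engine.mark_all('analyser', 'unprocessed', 'processing')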
@@ -0,0 +1,28 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       class DuplicateURLError < StandardError; end
+     end
+   end
+ end
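
`DuplicateURLError` carries the offending URL as its message (see the `raise DuplicateURLError, url` in MongoEngine above). Since crawlers re-discover the same links on many pages, callers typically treat it as a non-event; a brief sketch (`queue`, `link`, and `page_url` as in the MongoEngine sketch above):

    begin
      queue.add(link, page_url)
    rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
      # link was queued earlier; duplicates are expected while crawling
    end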
@@ -0,0 +1,150 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+ require 'news_crawler/storage/url_queue/mongo_storage'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store and manipulate the URL queue
+     module URLQueue
+       ACTION_LIST = [:mark_visited, :mark_processed, :find_unvisited,
+                      :find_unprocessed, :find_unprocessed_with_depth]
+       PROCESSED = 'processed'
+       PROCESSING = 'processing'
+       UNPROCESSED = 'unprocessed'
+
+       class << self
+         # Set the URLQueue storage engine
+         # @param [ Symbol, Object ] engine database engine; pass an object
+         #   to use a custom engine. As a symbol this can be:
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = URLQueueEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           url = normalize_url url
+           @engine.mark_visited(url)
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @engine.mark_all_unvisited
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           url = normalize_url url
+           @engine.mark(module_name, url, state)
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           @engine.mark_all(module_name, new_state, orig_state)
+         end
+
+         # Find all visited URLs with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           @engine.find_all(module_name, state, max_depth)
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           @engine.find_one(module_name, state, max_depth)
+         end
+
+         # Atomically get the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(module_name, max_depth = -1)
+           @engine.next_unprocessed(module_name, max_depth)
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs to return
+         # @return [ Array ] unvisited URLs, optionally limited by maximum depth
+         def find_unvisited(max_depth = -1)
+           @engine.find_unvisited(max_depth)
+         end
+
+         # Add a URL to the queue
+         # @param [ String ] url
+         # @param [ String ] ref_url referring URL
+         def add(url, ref_url = '')
+           url = normalize_url url
+           if ref_url != ''
+             ref_url = normalize_url ref_url
+           end
+           @engine.add(url, ref_url)
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           @engine.clear
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           @engine.all
+         end
+
+         # Prefix a URL with http:// if no scheme is present
+         def normalize_url(url)
+           if (!url.start_with? "http")
+             "http://" + url
+           else
+             url
+           end
+         end
+       end
+     end
+   end
+ end
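
Putting the facade together: engine selection, seeding, and a processing loop. A minimal sketch with an illustrative module name, assuming the same SimpleConfig MongoDB settings as the engine above; note that `normalize_url` lets callers pass scheme-less URLs:

    require 'news_crawler/storage/url_queue'

    include NewsCrawler::Storage

    URLQueue.set_engine(:mongo)
    URLQueue.add('example.com')           # stored as http://example.com
    URLQueue.mark_visited('example.com')  # a downloader would do this after fetching

    # Process visited URLs up to depth 2 for the hypothetical 'analyser' module.
    while (url = URLQueue.next_unprocessed('analyser', 2))
      # ... extract links, store raw data, etc. ...
      URLQueue.mark('analyser', url, URLQueue::PROCESSED)
    end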