news_crawler 0.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module RawData
+       # Base class for RawData engines.
+       # Subclass and implement all of its methods to create a new RawData
+       # engine; keep the methods' signatures unchanged.
+       class RawDataEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the registered engines
+         # @return [ Hash ] map from engine name (Symbol) to engine class
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           raise NotImplementedError
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           raise NotImplementedError
+         end
+
+         def count
+           raise NotImplementedError
+         end
+
+         def clear
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
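Because RawDataEngine.inherited records every subclass, a new backend only has to subclass it and define a NAME constant to show up in get_engines. A minimal sketch of that registration mechanism; the MemoryStorage class and its in-memory Hash are illustrative assumptions, not part of this gem:

require 'news_crawler/storage/raw_data/raw_data_engine'

module NewsCrawler
  module Storage
    module RawData
      # Hypothetical in-memory engine, shown only to illustrate registration
      class MemoryStorage < RawDataEngine
        NAME = 'memory'

        def initialize(*opts)
          @docs = {}
        end

        def add(url, body)
          @docs[url] = body
        end

        def find_by_url(url)
          @docs[url]
        end

        def count
          @docs.size
        end

        def clear
          @docs.clear
        end
      end
    end
  end
end

# RawDataEngine.get_engines now includes { :memory => MemoryStorage }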
@@ -0,0 +1,74 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+
+ require 'news_crawler/storage/raw_data/mongo_storage'
+ require 'news_crawler/storage/raw_data/raw_data_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store raw data fetched from websites
+     module RawData
+       class << self
+         # Set the RawData storage engine
+         # @param [ Symbol, Object ] engine database engine; this can be
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         #   or an object for a custom engine
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = RawDataEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Add an entry to the raw data collection
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           @engine.add(url, body)
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           @engine.find_by_url url
+         end
+
+         def count
+           @engine.count
+         end
+
+         def clear
+           @engine.clear
+         end
+       end
+     end
+   end
+ end
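With an engine selected, the RawData module methods delegate straight to it. A hedged usage sketch, assuming the file lives at the conventional require path news_crawler/storage/raw_data and that a MongoDB backend has already been configured:

require 'news_crawler/storage/raw_data'

raw = NewsCrawler::Storage::RawData
raw.set_engine(:mongo)                              # or pass a custom engine object
raw.add('http://example.org', '<html>...</html>')   # store a fetched page body
raw.find_by_url('http://example.org')               # => '<html>...</html>'
raw.count                                           # => 1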
@@ -0,0 +1,218 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'mongo'
+ require 'news_crawler/storage/url_queue/url_queue_error'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # List storage engine with MongoDB backend
+       class MongoEngine < NewsCrawler::Storage::URLQueue::URLQueueEngine
+         NAME = 'mongo'
+
+         require 'mongo'
+         include Mongo
+
+         # Construct a queue
+         def initialize(*opts)
+           config = SimpleConfig.for :application
+           db = MongoClient.new(config.mongodb.host, config.mongodb.port,
+                                pool_size: 4,
+                                pool_timeout: 5)[config.mongodb.db_name]
+           coll_name = config.prefix + '_' + config.suffix.url_queue
+           h_opts = ((opts[-1].is_a? Hash) ? opts[-1] : {})
+           @coll = db[h_opts[:coll_name] || coll_name]
+           @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+         end
+
+         # Add a URL to the list with a reference URL
+         # @param [ String ] url
+         # @param [ String ] ref_url
+         def add(url, ref_url = '')
+           if (ref_url == '')
+             depth = 0
+           else
+             depth = (get_url_depth(ref_url) || 0) + 1
+           end
+           begin
+             @coll.insert({:url => url,
+                           :depth => depth,
+                           :visited => false})
+           rescue Mongo::OperationFailure => e
+             if e.error_code == 11000 # duplicate key error
+               raise DuplicateURLError, url
+             else
+               raise e
+             end
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           @coll.update({:url => url},
+                        {:$set => {'visited' => true}})
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @coll.update({},
+                        {:$set => {'visited' => false}},
+                        {:multi => true})
+         end
+
+         # # Mark an URL as processed
+         # # @param [ String ] url
+         # def mark_processed(url, **opts)
+         #   @coll.update({:url => url},
+         #                {:$set => {:processed => true}})
+         # end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           @coll.update({:url => url},
+                        {:$set => {module_name => state}})
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           selector = (orig_state.nil? ? {} : {module_name => orig_state})
+           @coll.update(selector,
+                        {:$set => {module_name => new_state}},
+                        :multi => true)
+         end
+
+         # Get all URLs and their status
+         # @return [ Array ] array of hashes containing url and status
+         def all(*opts)
+           @coll.find.collect do | entry |
+             entry.each_key.inject({}) do | memo, key |
+               if key != '_id'
+                 memo[key.intern] = entry[key]
+               end
+               memo
+             end
+           end
+         end
+
+         # TODO fix bug - find *visited* url
+         # Find all visited URLs with the given module process state
+         # @param [ String ] modul_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ Array ] URL list
+         def find_all(modul_name, state, max_depth = -1)
+           if (state == URLQueue::UNPROCESSED)
+             selector = {:$or => [{modul_name => state},
+                                  {modul_name => {:$exists => false}}]}
+           else
+             selector = {modul_name => state}
+           end
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] modul_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ] URL, or nil if no URL matches the criteria
+         def find_one(modul_name, state, max_depth = -1)
+           a = find_all(modul_name, state, max_depth)
+           if a.size > 0
+             a[0]
+           else
+             nil
+           end
+         end
+
+         # Get the next unprocessed URL and atomically mark it as processing
+         # @param [ String ] modul_name
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(modul_name, max_depth = -1)
+           selector = {:$or => [{modul_name => URLQueue::UNPROCESSED},
+                                {modul_name => {:$exists => false}}]}
+           selector = {:$and => [selector,
+                                 {'visited' => true}]}
+           if max_depth > -1
+             selector[:$and] << {'depth' => {:$lte => max_depth}}
+           end
+           doc = @coll.find_and_modify(:query => selector,
+                                       :update => {:$set =>
+                                         {modul_name => URLQueue::PROCESSING}})
+           if doc.nil?
+             nil
+           else
+             doc['url']
+           end
+         end
+         alias :find_and_mark :next_unprocessed
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs returned
+         # @return [ Array ] unvisited URLs up to the maximum depth (optional)
+         def find_unvisited(max_depth = -1)
+           if max_depth > -1
+             selector = {:$and => [{'visited' => false},
+                                   {'depth' => {:$lte => max_depth}}]}
+           else
+             selector = {'visited' => false}
+           end
+           @coll.find(selector).collect do | entry |
+             entry['url']
+           end
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear(*opts)
+           count = @coll.count
+           @coll.remove
+           count
+         end
+
+         # Get the depth of the given URL
+         # @param [ String ] url
+         # @return [ Fixnum, nil ] URL depth, or nil if the URL is unknown
+         def get_url_depth(url)
+           doc = @coll.find_one({'url' => url}, {:fields => ['depth']})
+           doc && doc['depth']
+         end
+       end
+     end
+   end
+ end
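MongoEngine reads its connection settings from SimpleConfig (config.mongodb.host, config.mongodb.port, config.mongodb.db_name) and builds the collection name from config.prefix and config.suffix.url_queue. A sketch of the configuration it expects; the concrete values here are assumptions for illustration, not defaults shipped with the gem:

require 'simple_config'

SimpleConfig.for :application do
  set :prefix, 'news_crawler'        # collection name prefix

  group :suffix do
    set :url_queue, 'url_queue'      # suffix for the URL queue collection
  end

  group :mongodb do
    set :host, 'localhost'
    set :port, 27017
    set :db_name, 'news_crawler'
  end
end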
@@ -0,0 +1,124 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       # Base class for URLQueue engines.
+       # Subclass and implement all of its methods to create a new URLQueue
+       # engine; keep the methods' signatures unchanged.
+       class URLQueueEngine
+         def self.inherited(klass)
+           @engine_list = (@engine_list || []) + [klass]
+         end
+
+         # Get the registered engines
+         # @return [ Hash ] map from engine name (Symbol) to engine class
+         def self.get_engines
+           @engine_list = @engine_list || []
+           @engine_list.inject({}) do | memo, klass |
+             memo[klass::NAME.intern] = klass
+             memo
+           end
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           raise NotImplementedError
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           raise NotImplementedError
+         end
+
+         # Produce the next unprocessed URL and mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ]
+         def next_unprocessed(module_name, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find all visited URLs with the module's state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs returned
+         # @return [ Array ] unvisited URLs up to the maximum depth (optional)
+         def find_unvisited(max_depth = -1)
+           raise NotImplementedError
+         end
+
+         # Add a URL with a reference URL
+         # @param [ String ] url URL
+         # @param [ String ] ref_url reference URL
+         def add(url, ref_url = '')
+           raise NotImplementedError
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           raise NotImplementedError
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           raise NotImplementedError
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           raise NotImplementedError
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           raise NotImplementedError
+         end
+       end
+     end
+   end
+ end
@@ -0,0 +1,28 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ module NewsCrawler
+   module Storage
+     module URLQueue
+       class DuplicateURLError < StandardError; end
+     end
+   end
+ end
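MongoEngine#add translates Mongo's duplicate-key failure (error code 11000) into this DuplicateURLError, so a crawler can treat re-discovered links as an expected condition rather than a crash. A hedged sketch, where queue stands for an already configured MongoEngine instance:

begin
  queue.add('http://example.org/page', 'http://example.org')
rescue NewsCrawler::Storage::URLQueue::DuplicateURLError
  # the URL is already queued; nothing to do
end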
@@ -0,0 +1,150 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+ require 'news_crawler/storage/url_queue/mongo_storage'
+ require 'news_crawler/storage/url_queue/url_queue_engine'
+
+ module NewsCrawler
+   module Storage
+     # Store and manipulate the URL queue
+     module URLQueue
+       ACTION_LIST = [:mark_visited, :mark_processed, :find_unvisited,
+                      :find_unprocessed, :find_unprocessed_with_depth]
+       PROCESSED = 'processed'
+       PROCESSING = 'processing'
+       UNPROCESSED = 'unprocessed'
+
+       class << self
+         # Set the URLQueue storage engine
+         # @param [ Symbol, Object ] engine database engine; this can be
+         #   * `:mongo`, `:mongodb` for the MongoDB backend
+         #   or an object for a custom engine
+         # @param [ Hash ] opts options passed to the engine
+         def set_engine(engine, *opts)
+           if engine.respond_to? :intern
+             engine = engine.intern
+           end
+           engine_class = URLQueueEngine.get_engines[engine]
+           if engine_class
+             @engine = engine_class.new(*opts)
+           else
+             @engine = engine
+           end
+         end
+
+         # Mark a URL as visited
+         # @param [ String ] url
+         def mark_visited(url)
+           url = normalize_url url
+           @engine.mark_visited(url)
+         end
+
+         # Mark all URLs as unvisited
+         def mark_all_unvisited
+           @engine.mark_all_unvisited
+         end
+
+         # Set the processing state of a URL for the given module
+         # @param [ String ] module_name
+         # @param [ String ] url
+         # @param [ String ] state one of unprocessed, processing, processed
+         def mark(module_name, url, state)
+           url = normalize_url url
+           @engine.mark(module_name, url, state)
+         end
+
+         # Change all URLs in one state to another state
+         # @param [ String ] module_name
+         # @param [ String ] new_state new state
+         # @param [ String ] orig_state original state
+         def mark_all(module_name, new_state, orig_state = nil)
+           @engine.mark_all(module_name, new_state, orig_state)
+         end
+
+         # Find all visited URLs with the module's state
+         # @param [ String ] module_name
+         # @param [ String ] state
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ Array ] URL list
+         def find_all(module_name, state, max_depth = -1)
+           @engine.find_all(module_name, state, max_depth)
+         end
+
+         # Find one visited URL with the given module process state
+         # @param [ String ] module_name
+         # @param [ String ] state one of unprocessed, processing, processed
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ] URL
+         def find_one(module_name, state, max_depth = -1)
+           @engine.find_one(module_name, state, max_depth)
+         end
+
+         # Get the next unprocessed URL and atomically mark it as processing
+         # @param [ String ] module_name
+         # @param [ Fixnum ] max_depth maximum URL depth returned (inclusive)
+         # @return [ String, nil ] URL, or nil if no such URL exists
+         def next_unprocessed(module_name, max_depth = -1)
+           @engine.next_unprocessed(module_name, max_depth)
+         end
+
+         # Get the list of unvisited URLs
+         # @param [ Fixnum ] max_depth maximum depth of URLs returned
+         # @return [ Array ] unvisited URLs up to the maximum depth (optional)
+         def find_unvisited(max_depth = -1)
+           @engine.find_unvisited(max_depth)
+         end
+
+         # Add a URL to the queue
+         # @param [ String ] url
+         # @param [ String ] ref_url reference URL
+         def add(url, ref_url = '')
+           url = normalize_url url
+           if ref_url != ''
+             ref_url = normalize_url ref_url
+           end
+           @engine.add(url, ref_url)
+         end
+
+         # Clear the URL queue
+         # @return [ Fixnum ] number of URLs removed
+         def clear
+           @engine.clear
+         end
+
+         # Get all URLs with status
+         # @return [ Array ] URL list
+         def all
+           @engine.all
+         end
+
+         def normalize_url(url)
+           if (!url.start_with? "http")
+             "http://" + url
+           else
+             url
+           end
+         end
+       end
+     end
+   end
+ end
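Putting the pieces together, the URLQueue facade normalizes URLs, picks an engine, and delegates the queue operations to it. A usage sketch under the same assumptions as above (MongoDB configured via SimpleConfig, conventional require path); the 'downloader' module name is hypothetical:

require 'news_crawler/storage/url_queue'

queue = NewsCrawler::Storage::URLQueue
queue.set_engine(:mongo)

queue.add('example.org')                  # normalize_url prepends "http://"
queue.mark_visited('http://example.org')  # only visited URLs are handed out

url = queue.next_unprocessed('downloader')
queue.mark('downloader', url, NewsCrawler::Storage::URLQueue::PROCESSED) if url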