news_crawler 0.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 3cdcc66767575d17c8a1f5d25f00713f74d51e98
+   data.tar.gz: 7c7d2c838066a45c4365a0756625e4a08c5d4482
+ SHA512:
+   metadata.gz: 75f259b0cafbff494302955a1b7968d331d69298c8e5a31185b0b8288408f7af323721eb83a2043005e2acb7e12337051622e191356da497892eb35701dfe7d8
+   data.tar.gz: 4382b0ed8d4bc28134d8af322f9a2f46b95a8732b2629835ae0b49da4e3ecdaa6fcce671c7fd5adc8cc198fb8e107360678277767b9cc9d706bae7a6530b2dd4
data/bin/news_crawler ADDED
@@ -0,0 +1,94 @@
+ #! /usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'optparse'
+ require 'news_crawler/config'
+ require 'news_crawler/nc_logger'
+
+ require 'news_crawler/downloader'
+ require 'news_crawler/link_selector/same_domain_selector'
+
+ include NewsCrawler::Storage
+
+ options = {}
+
+ OptionParser.new do | opts |
+   opts.banner = "Usage: news_crawler [options] url"
+
+   opts.on('-c', "--app-conf FILE", "Application configuration file") do | f |
+     options[:app_conf] = File.expand_path(f)
+     raise Errno::ENOENT unless File.exists? options[:app_conf]
+   end
+
+   opts.on('-sds', "--sds-conf FILE", "Same domain selector configuration file") do | f |
+     options[:sds_conf] = File.expand_path(f)
+     raise Errno::ENOENT unless File.exists? options[:sds_conf]
+   end
+
+   opts.on('-c', '--[no-]cleardb', "Clear database") do | cd |
+     options[:cleardb] = cd
+   end
+
+   opts.on('-d', "--max-depth DEPTH", OptionParser::DecimalInteger,
+           'Maximum depth of url to crawl') do | d |
+     options[:max_depth] = d
+   end
+ end.parse!
+
+
+ NewsCrawler::CrawlerConfig.load_application_config(options[:app_conf]) unless options[:app_conf].nil?
+ NewsCrawler::CrawlerConfig.load_samedomainselector_config(options[:sds_conf]) unless options[:sds_conf].nil?
+
+ config = SimpleConfig.for :application
+ NewsCrawler::Storage::RawData.set_engine(config.db.engine.intern)
+ NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
+
+ if (options[:cleardb])
+   URLQueue.clear
+   RawData.clear
+ end
+
+ if ARGV.size > 0
+   url = ARGV[0]
+   URLQueue.add(url)
+ end
+
+ puts "Starting Downloader"
+ dwl = NewsCrawler::Downloader.new(false)
+ dwl.async.run
+
+ puts "Starting SDS"
+ se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
+ se.async.run
+ puts "Stoping SDS"
+ se.graceful_terminate
+ se.terminate
+ puts "SDS stopped"
+
+ sleep(5)
+
+ puts "Stoping Downloader"
+ dwl.graceful_terminate
+ dwl.terminate
+ puts "Downloader stopped"
@@ -0,0 +1,33 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+ require 'mongo'
+
+ require 'news_crawler/config'
+ require 'news_crawler/storage/url_queue'
+
+ SimpleConfig.for(:application) do
+   set :prefix, 'test'
+ end
+
+ config = SimpleConfig.for :application
+ NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
@@ -0,0 +1,53 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+
+ module NewsCrawler
+   class CrawlerConfig
+     DEFAULT_CONFIG = File.join(File.dirname(__FILE__),
+                                './default_config.yml')
+     DEFAULT_SDS_CONFIG = File.join(File.dirname(__FILE__),
+                                    './default_sds.yml')
+
+     def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
+       if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
+         @app_loaded = true
+         SimpleConfig.for :application do
+           load file
+         end
+       end
+     end
+
+     def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
+       if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
+         @sds_loaded = true
+         SimpleConfig.for :same_domain_selector do
+           load file
+         end
+       end
+     end
+   end
+ end
+
+ NewsCrawler::CrawlerConfig.load_application_config
+ NewsCrawler::CrawlerConfig.load_samedomainselector_config
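CrawlerConfig loads each bundled YAML file into a SimpleConfig namespace exactly once unless a non-default path is passed, so requiring this file is enough to get the defaults. A minimal sketch of overriding them, assuming my_app_conf.yml is a hypothetical file shaped like default_config.yml shown below:

    require 'news_crawler/config'

    # Re-load the :application namespace from a custom file (hypothetical path).
    NewsCrawler::CrawlerConfig.load_application_config(File.expand_path('my_app_conf.yml'))

    # Values are then read back through simple_config, as bin/news_crawler does.
    config = SimpleConfig.for :application
    engine = config.db.engine.intern   # e.g. :mongo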
@@ -0,0 +1,70 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'news_crawler/storage/url_queue'
+ require 'thread'
+
+ module NewsCrawler
+   # Include this to get basic module methods
+   module CrawlerModule
+     # Mark current url process state of current module is processed
+     # @param [ String ] url
+     def mark_processed(url)
+       URLQueue.mark(self.class.name, url, URLQueue::PROCESSED)
+     end
+
+     # Mark current url process state of current module is unprocessed
+     # @param [ String ] url
+     def mark_unprocessed(url)
+       URLQueue.mark(self.class.name, url, URLQueue::UNPROCESSED)
+     end
+
+     # Find all visited unprocessed url
+     # @param [ Fixnum ] max_depth max url depth return (inclusive)
+     # @return [ Array ] URL list
+     def find_unprocessed(max_depth = -1)
+       URLQueue.find_all(self.class.name, URLQueue::UNPROCESSED, max_depth)
+     end
+
+     # Find one visited url with given current module process state
+     # @param [ String ] state one of unprocessed, processing, processed
+     # @param [ Fixnum ] max_depth max url depth return (inclusive)
+     # @return [ Array ] URL list
+     def find_all(state, max_depth = -1)
+       URLQueue.find_all(self.class.name, state, max_depth)
+     end
+
+     # Find all visited urls with current module's state
+     # @param [ String ] state
+     # @param [ Fixnum ] max_depth max url depth return (inclusive)
+     # @return [ String, nil ] URL or nil if url doesn't exists
+     def find_one(state, max_depth = -1)
+       URLQueue.find_one(self.class.name, state, max_depth)
+     end
+
+     # Get next unprocessed a url and mark it as processing in atomic
+     # @param [ Fixnum ] max_depth max url depth return (inclusive)
+     # @return [ String, nil ] URL or nil if url doesn't exists
+     def next_unprocessed(max_depth = -1)
+       URLQueue.next_unprocessed(self.class.name, max_depth)
+     end
+   end
+ end
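CrawlerModule is the mixin that processing stages use to talk to the shared URLQueue; each stage tracks its own per-URL state keyed by its class name. A minimal sketch of a consumer, assuming the storage engines are already configured as in bin/news_crawler (WordCounter is a hypothetical class name):

    require 'news_crawler/crawler_module'

    module NewsCrawler
      class WordCounter
        include CrawlerModule

        def run
          # next_unprocessed atomically claims a URL for this module.
          while (url = next_unprocessed)
            # ... process the stored page for url here ...
            mark_processed(url)   # record this module's state for the URL
          end
        end
      end
    end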
@@ -0,0 +1,13 @@
+ db:
+   :engine: :mongo
+
+ :mongodb:
+   :host: localhost
+   :port: !str 27017
+   :db_name: news-crawler
+
+ :suffix:
+   :raw_data: raw_data
+   :url_queue: url_queue
+
+ prefix: ''
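These defaults are what the storage classes read at start-up. A sketch of how the keys surface through simple_config once the defaults are loaded (the comments assume the file above is unchanged):

    require 'news_crawler/config'

    config = SimpleConfig.for :application
    config.db.engine                               # engine name, interned by bin/news_crawler
    config.mongodb.host                            # "localhost", used by MongoStorage
    config.prefix + '_' + config.suffix.raw_data   # collection name built by MongoStorage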
@@ -0,0 +1 @@
+ :exclude:
@@ -0,0 +1,112 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'celluloid'
+ require 'typhoeus'
+ require 'simpleconfig'
+
+ require 'news_crawler/config'
+ require 'news_crawler/storage/raw_data'
+ require 'news_crawler/utils/robots_patch'
+ require 'news_crawler/nc_logger'
+
+ module NewsCrawler
+   # This class implement an parallel downloader based on Typhoes
+   # with given queue
+   class Downloader
+     include Celluloid
+
+     CONCURRENT_DOWNLOAD = 4
+
+     # Construct downloader with an URLQueue
+     # @param [ Boolean ] start_on_create whether start selector immediately
+     # @param [ NewsCrawler::URLQueue ] queue url queue
+     def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
+       @queue = queue
+       @urls = queue.find_unvisited
+       @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
+       @wait_time = 1
+       @status = :running
+       @stoping = false
+       wait_for_url if start_on_create
+     end
+
+     # Start downloader with current queue
+     # URL successed fetch is marked and result's stored in DB
+     def run
+       @status = :running
+       hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
+       # TODO Log here
+       @urls = @urls.keep_if do | url |
+         Robots.instance.allowed? url
+       end
+       requests = @urls.map do | url |
+         re = Typhoeus::Request.new(url, followlocation: true)
+         re.on_complete do | response |
+           if response.success?
+             Storage::RawData.add(url, response.response_body)
+             @queue.mark_visited url
+           else
+             NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
+           end
+         end
+         hydra.queue re
+         re
+       end
+       hydra.run
+       @urls = []
+       wait_for_url
+     end
+
+     # Graceful terminate this downloader
+     def graceful_terminate
+       @stoping = true
+       while @status == :running
+         sleep(1)
+       end
+     end
+
+     private
+     # Waiting for new urls're added to queue, using backoff algorithms
+     def wait_for_url
+       @status = :waiting
+       if @stoping # check for stop flag
+         return
+       end
+       sleep @wait_time
+       get_new_url
+       if @urls.size == 0
+         if @wait_time < 30
+           @wait_time = @wait_time * 2
+         end
+         wait_for_url
+       else
+         @wait_time = 1
+         run
+       end
+     end
+
+     def get_new_url
+       @urls = @queue.find_unvisited
+     end
+   end
+ end
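The Downloader is a Celluloid actor: run drains the unvisited URLs through a Typhoeus hydra (skipping anything disallowed by robots.txt), stores successful bodies in RawData, marks them visited, and then polls the queue again with an exponential backoff capped near 30 seconds. A minimal driving sketch, mirroring bin/news_crawler and assuming the storage engines are configured and the queue is seeded:

    require 'news_crawler/downloader'

    dwl = NewsCrawler::Downloader.new(false)   # false: do not start polling in the constructor
    dwl.async.run                              # fetch in the background actor
    # ... let it work for a while ...
    dwl.graceful_terminate                     # wait until the current batch finishes
    dwl.terminate                              # shut the Celluloid actor down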
@@ -0,0 +1,172 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'celluloid'
+ require 'nokogiri'
+
+ require 'news_crawler/storage/raw_data'
+ require 'news_crawler/url_helper'
+ require 'news_crawler/crawler_module'
+ require 'news_crawler/nc_logger'
+
+ module NewsCrawler
+   module LinkSelector
+     # Select all link from same domain.
+     # Domain is got from database
+     class SameDomainSelector
+       include NewsCrawler::URLHelper
+       extend NewsCrawler::URLHelper
+
+       include NewsCrawler::CrawlerModule
+       include Celluloid
+
+       # Create new selector with queue
+       # URL's selected is put back into queue
+       # @param [ Fixnum ] max_depth maxinum depth to crawl
+       # @param [ Boolean ] start_on_create whether start selector immediately
+       def initialize(max_depth = -1, start_on_create = true)
+         @max_depth = max_depth
+         @wait_time = 1
+         @status = :running
+         @stoping = false
+         run if start_on_create
+       end
+
+       # Extract url from page
+       def extract_url(url)
+         doc = RawData.find_by_url(url)
+         html_doc = Nokogiri::HTML(doc)
+         results = []
+
+         inner_url = html_doc.xpath('//a').collect { | a_el |
+           temp_url = (a_el.attribute 'href').to_s
+           if (!temp_url.nil?) && (temp_url[0] == '/')
+             temp_url = url + temp_url
+           end
+           temp_url
+         }
+
+         inner_url.delete_if { | url |
+           (url.nil?) || (url.size == 0) || (url == '#')
+         }
+
+         # select url from same domain
+         inner_url.select { | o_url |
+           if (same_domain?(o_url, url))
+             if (!SameDomainSelector.exclude?(o_url))
+               begin
+                 URLQueue.add(o_url, url)
+                 results << [o_url, url]
+               rescue URLQueue::DuplicateURLError => e
+               end
+             else
+               # TODO Log here
+             end
+           end
+         }
+       end
+
+       def run
+         @status = :running
+         return if @stoping
+         if @max_depth == 0
+           @status = :stopped
+           return
+         end
+         while !@stoping
+           url = next_unprocessed(@max_depth - 1)
+           while (url.nil?)
+             wait_for_url
+             url = next_unprocessed(@max_depth - 1)
+           end
+           NCLogger.get_logger.info "Processing #{url}"
+           extract_url(url)
+           mark_processed(url)
+         end
+       end
+
+       # Test whether url is excluded
+       # @param [ String ] url
+       # @return [ Boolean ] true if url is excluded, false otherwise
+       def self.exclude?(url)
+         config = SimpleConfig.for :same_domain_selector
+         exclude_list = []
+         url_domain = get_url_path(url)[:domain]
+         begin
+           exclude_group = config.exclude
+         rescue NoMethodError => e
+           return false
+         end
+
+         exclude_group.to_hash.keys.each do | url_e |
+           if url_domain.to_s.end_with? url_e.to_s
+             exclude_list = config.exclude.get(url_e)
+             break
+           end
+         end
+
+         exclude_list = exclude_list.map do | elt |
+           if /^\/.*\/$/ =~ elt
+             Regexp.new(elt[1..-2]) # already an Regex
+           else
+             new_elt = "^(.*/)?#{elt}(/.*)?$"
+             Regexp.new(new_elt)
+           end
+         end
+
+         if exclude_list.count == 0
+           return false
+         end
+
+         # url.split('/').each do | part |
+         # if exclude_list.include? part
+         # return true
+         # end
+         # end
+         exclude_list.each do | exclude_rule |
+           if exclude_rule =~ url
+             return true
+           end
+         end
+         return false
+       end
+
+       # Graceful terminate this selector
+       def graceful_terminate
+         @stoping = true
+         while @status == :running
+           sleep(1)
+         end
+       end
+
+       private
+       # Waiting for new urls're added to queue, using backoff algorithms
+       def wait_for_url
+         @status = :waiting
+         sleep @wait_time
+         if @wait_time < 30
+           @wait_times = @wait_time * 2
+         end
+       end
+     end
+   end
+ end
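SameDomainSelector is the companion actor: it claims downloaded pages from the queue, extracts anchor hrefs with Nokogiri, and pushes links that stay on the same domain back into URLQueue, skipping anything matched by the :same_domain_selector exclude rules (bare words become anchored path patterns, while /.../ entries are used as regexes directly). A minimal driving sketch, mirroring bin/news_crawler, with a hypothetical depth limit of 2:

    require 'news_crawler/link_selector/same_domain_selector'

    se = NewsCrawler::LinkSelector::SameDomainSelector.new(2, false)  # max_depth 2, no autostart
    se.async.run            # scan stored pages and enqueue same-domain links
    # ...
    se.graceful_terminate   # let the URL currently being processed finish
    se.terminate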
@@ -0,0 +1,49 @@
+ #! /usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'logger'
+
+ module NewsCrawler
+   class NCLogger
+     # Get logger
+     def self.get_logger
+       @logger ||= Logger.new(STDERR)
+       @logger.progname = 'news_crawler'
+       @logger
+     end
+
+     # Set logger level
+     # param [ Logger::Severity ] l level
+     def self.set_level(l)
+       get_logger.level = l
+     end
+
+     # Set logger, should same API as Ruby Logger
+     # param [ Object ] l logger
+     def self.set_logdev(ld)
+       @logger = Logger.new(ld)
+       @logger.progname = 'news_crawler'
+     end
+   end
+ end
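NCLogger is a thin process-wide wrapper around Ruby's Logger that defaults to STDERR. A small sketch of redirecting it and raising the threshold (the log file name is hypothetical):

    require 'logger'
    require 'news_crawler/nc_logger'

    NewsCrawler::NCLogger.set_logdev(File.open('news_crawler.log', 'a'))  # replace the STDERR logger
    NewsCrawler::NCLogger.set_level(Logger::WARN)                         # silence info messages
    NewsCrawler::NCLogger.get_logger.warn('fetch failed')                 # written to the file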
@@ -0,0 +1,77 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'mongo'
+ require 'simple_config'
+ require 'news_crawler/storage/raw_data/raw_data_engine'
+
+
+ module NewsCrawler
+   module Storage
+     module RawData
+       # Raw data storage implement using MongoDB
+       class MongoStorage < NewsCrawler::Storage::RawData::RawDataEngine
+         NAME = 'mongo'
+
+         include Mongo
+
+         def initialize(*opts)
+           config = (SimpleConfig.for :application)
+           client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+           db = client[config.mongodb.db_name]
+           @coll = db[config.prefix + '_' + config.suffix.raw_data]
+           @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+         end
+
+         # Add entry to raw data collection, overwrite old data
+         # param [ String ] url
+         # param [ String ] body
+         def add(url, body)
+           @coll.update({:url => url},
+                        {:$set => {:body => body}},
+                        {:upsert => true})
+         end
+
+         # Find document with correspond url
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           result = @coll.find_one({:url => url})
+           if (!result.nil?)
+             result['body']
+           else
+             nil
+           end
+         end
+
+         # Get number of raw data entries
+         def count
+           @coll.count
+         end
+
+         def clear
+           @coll.remove
+         end
+       end
+     end
+   end
+ end
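MongoStorage backs the RawData module with a single collection, unique-indexed on url and written with upserts so refetching a page overwrites its stored body. A usage sketch through the module-level API that the Downloader and SameDomainSelector use, assuming a local MongoDB matching the :mongodb defaults (the URL and body are hypothetical):

    require 'news_crawler/config'
    require 'news_crawler/storage/raw_data'

    NewsCrawler::Storage::RawData.set_engine(:mongo)   # pick this engine, as bin/news_crawler does
    NewsCrawler::Storage::RawData.add('http://example.org/', '<html>...</html>')
    NewsCrawler::Storage::RawData.find_by_url('http://example.org/')  # => "<html>...</html>"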