news_crawler 0.0.0.pre.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 3cdcc66767575d17c8a1f5d25f00713f74d51e98
+   data.tar.gz: 7c7d2c838066a45c4365a0756625e4a08c5d4482
+ SHA512:
+   metadata.gz: 75f259b0cafbff494302955a1b7968d331d69298c8e5a31185b0b8288408f7af323721eb83a2043005e2acb7e12337051622e191356da497892eb35701dfe7d8
+   data.tar.gz: 4382b0ed8d4bc28134d8af322f9a2f46b95a8732b2629835ae0b49da4e3ecdaa6fcce671c7fd5adc8cc198fb8e107360678277767b9cc9d706bae7a6530b2dd4
data/bin/news_crawler ADDED
@@ -0,0 +1,94 @@
+ #! /usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'optparse'
+ require 'news_crawler/config'
+ require 'news_crawler/nc_logger'
+
+ require 'news_crawler/downloader'
+ require 'news_crawler/link_selector/same_domain_selector'
+
+ include NewsCrawler::Storage
+
+ options = {}
+
+ OptionParser.new do | opts |
+   opts.banner = "Usage: news_crawler [options] url"
+
+   opts.on('-a', "--app-conf FILE", "Application configuration file") do | f |
+     options[:app_conf] = File.expand_path(f)
+     raise Errno::ENOENT unless File.exist? options[:app_conf]
+   end
+
+   opts.on('-s', "--sds-conf FILE", "Same domain selector configuration file") do | f |
+     options[:sds_conf] = File.expand_path(f)
+     raise Errno::ENOENT unless File.exist? options[:sds_conf]
+   end
+
+   opts.on('-c', '--[no-]cleardb', "Clear database") do | cd |
+     options[:cleardb] = cd
+   end
+
+   opts.on('-d', "--max-depth DEPTH", OptionParser::DecimalInteger,
+           'Maximum depth of url to crawl') do | d |
+     options[:max_depth] = d
+   end
+ end.parse!
+
+
+ NewsCrawler::CrawlerConfig.load_application_config(options[:app_conf]) unless options[:app_conf].nil?
+ NewsCrawler::CrawlerConfig.load_samedomainselector_config(options[:sds_conf]) unless options[:sds_conf].nil?
+
+ config = SimpleConfig.for :application
+ NewsCrawler::Storage::RawData.set_engine(config.db.engine.intern)
+ NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
+
+ if (options[:cleardb])
+   URLQueue.clear
+   RawData.clear
+ end
+
+ if ARGV.size > 0
+   url = ARGV[0]
+   URLQueue.add(url)
+ end
+
+ puts "Starting Downloader"
+ dwl = NewsCrawler::Downloader.new(false)
+ dwl.async.run
+
+ puts "Starting SDS"
+ se = NewsCrawler::LinkSelector::SameDomainSelector.new(options[:max_depth] || 1, false)
+ se.async.run
+ puts "Stopping SDS"
+ se.graceful_terminate
+ se.terminate
+ puts "SDS stopped"
+
+ sleep(5)
+
+ puts "Stopping Downloader"
+ dwl.graceful_terminate
+ dwl.terminate
+ puts "Downloader stopped"
@@ -0,0 +1,33 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+ require 'mongo'
+
+ require 'news_crawler/config'
+ require 'news_crawler/storage/url_queue'
+
+ SimpleConfig.for(:application) do
+   set :prefix, 'test'
+ end
+
+ config = SimpleConfig.for :application
+ NewsCrawler::Storage::URLQueue.set_engine(config.db.engine.intern)
@@ -0,0 +1,53 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'simple_config'
+
+ module NewsCrawler
+   class CrawlerConfig
+     DEFAULT_CONFIG = File.join(File.dirname(__FILE__),
+                                './default_config.yml')
+     DEFAULT_SDS_CONFIG = File.join(File.dirname(__FILE__),
+                                    './default_sds.yml')
+
+     def self.load_application_config(file = CrawlerConfig::DEFAULT_CONFIG)
+       if ((file != DEFAULT_CONFIG) || (@app_loaded != true))
+         @app_loaded = true
+         SimpleConfig.for :application do
+           load file
+         end
+       end
+     end
+
+     def self.load_samedomainselector_config(file = CrawlerConfig::DEFAULT_SDS_CONFIG)
+       if ((file != DEFAULT_SDS_CONFIG) || (@sds_loaded != true))
+         @sds_loaded = true
+         SimpleConfig.for :same_domain_selector do
+           load file
+         end
+       end
+     end
+   end
+ end
+
+ NewsCrawler::CrawlerConfig.load_application_config
+ NewsCrawler::CrawlerConfig.load_samedomainselector_config
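A minimal sketch of how another script might load a custom application config through CrawlerConfig and read it back via SimpleConfig; the path is a placeholder, and the db.engine key comes from the bundled default_config.yml:

    require 'news_crawler/config'

    # Load a custom file instead of the bundled default_config.yml (path is hypothetical)
    NewsCrawler::CrawlerConfig.load_application_config('/etc/news_crawler/app.yml')

    config = SimpleConfig.for :application
    engine = config.db.engine.intern   # e.g. :mongo, as in default_config.yml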
@@ -0,0 +1,70 @@
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'news_crawler/storage/url_queue'
+ require 'thread'
+
+ module NewsCrawler
+   # Include this to get basic module methods
+   module CrawlerModule
+     # Mark the URL as processed for the current module
+     # @param [ String ] url
+     def mark_processed(url)
+       URLQueue.mark(self.class.name, url, URLQueue::PROCESSED)
+     end
+
+     # Mark the URL as unprocessed for the current module
+     # @param [ String ] url
+     def mark_unprocessed(url)
+       URLQueue.mark(self.class.name, url, URLQueue::UNPROCESSED)
+     end
+
+     # Find all visited but still unprocessed URLs
+     # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+     # @return [ Array ] URL list
+     def find_unprocessed(max_depth = -1)
+       URLQueue.find_all(self.class.name, URLQueue::UNPROCESSED, max_depth)
+     end
+
+     # Find all visited URLs with the given process state for the current module
+     # @param [ String ] state one of unprocessed, processing, processed
+     # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+     # @return [ Array ] URL list
+     def find_all(state, max_depth = -1)
+       URLQueue.find_all(self.class.name, state, max_depth)
+     end
+
+     # Find one visited URL with the given process state for the current module
+     # @param [ String ] state one of unprocessed, processing, processed
+     # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+     # @return [ String, nil ] URL, or nil if no such URL exists
+     def find_one(state, max_depth = -1)
+       URLQueue.find_one(self.class.name, state, max_depth)
+     end
+
+     # Atomically get the next unprocessed URL and mark it as processing
+     # @param [ Fixnum ] max_depth maximum URL depth to return (inclusive)
+     # @return [ String, nil ] URL, or nil if no such URL exists
+     def next_unprocessed(max_depth = -1)
+       URLQueue.next_unprocessed(self.class.name, max_depth)
+     end
+   end
+ end
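A sketch of a hypothetical processing module built on these helpers; the class name and loop body are illustrative only, but the next_unprocessed/mark_processed cycle mirrors how SameDomainSelector (below) consumes the queue:

    require 'news_crawler/crawler_module'
    require 'news_crawler/storage/raw_data'

    include NewsCrawler::Storage   # as in bin/news_crawler, so URLQueue/RawData resolve

    # Hypothetical module: reports the size of every fetched, still-unprocessed page
    class BodySizeReporter
      include NewsCrawler::CrawlerModule

      def run
        while (url = next_unprocessed)            # atomically claim the next URL
          body = RawData.find_by_url(url)
          puts "#{url}: #{body.to_s.size} bytes"
          mark_processed(url)                     # record this module's state for the URL
        end
      end
    end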
@@ -0,0 +1,13 @@
+ db:
+   :engine: :mongo
+
+ :mongodb:
+   :host: localhost
+   :port: !str 27017
+   :db_name: news-crawler
+
+ :suffix:
+   :raw_data: raw_data
+   :url_queue: url_queue
+
+ prefix: ''
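An application config passed via --app-conf presumably follows the same shape; a hypothetical override (host, db_name and prefix values are placeholders):

    db:
      :engine: :mongo

    :mongodb:
      :host: mongo.internal       # hypothetical host
      :port: !str 27017
      :db_name: news-crawler-dev  # hypothetical database name

    prefix: 'dev'                 # collections become e.g. dev_raw_data, dev_url_queue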
@@ -0,0 +1 @@
+ :exclude:
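The bundled default leaves the exclude list empty. Based on how SameDomainSelector.exclude? (below) reads this config, a hypothetical --sds-conf file maps domain suffixes to patterns, where a plain entry matches a path segment and a /.../ entry is compiled directly as a regular expression:

    :exclude:
      example.com:      # hypothetical domain, matched as a suffix of the URL's domain
        - video         # plain segment, becomes ^(.*/)?video(/.*)?$
        - /comments?/   # /.../ form, compiled as a Regexp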
@@ -0,0 +1,112 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'celluloid'
+ require 'typhoeus'
+ require 'simple_config'
+
+ require 'news_crawler/config'
+ require 'news_crawler/storage/raw_data'
+ require 'news_crawler/utils/robots_patch'
+ require 'news_crawler/nc_logger'
+
+ module NewsCrawler
+   # This class implements a parallel downloader based on Typhoeus,
+   # fed from a given URL queue
+   class Downloader
+     include Celluloid
+
+     CONCURRENT_DOWNLOAD = 4
+
+     # Construct a downloader with a URLQueue
+     # @param [ Boolean ] start_on_create whether to start the downloader immediately
+     # @param [ NewsCrawler::URLQueue ] queue URL queue
+     def initialize(start_on_create = true, queue = NewsCrawler::Storage::URLQueue, **opts)
+       @queue = queue
+       @urls = queue.find_unvisited
+       @concurrent_download = opts[:concurrent] || CONCURRENT_DOWNLOAD
+       @wait_time = 1
+       @status = :running
+       @stopping = false
+       wait_for_url if start_on_create
+     end
+
+     # Start the downloader with the current queue.
+     # Successfully fetched URLs are marked as visited and their bodies are stored in the DB.
+     def run
+       @status = :running
+       hydra = Typhoeus::Hydra.new(max_concurrency: @concurrent_download)
+       # TODO Log here
+       @urls = @urls.keep_if do | url |
+         Robots.instance.allowed? url
+       end
+       requests = @urls.map do | url |
+         re = Typhoeus::Request.new(url, followlocation: true)
+         re.on_complete do | response |
+           if response.success?
+             Storage::RawData.add(url, response.response_body)
+             @queue.mark_visited url
+           else
+             NCLogger.get_logger.warn("[WARNING] Fetch error [#{url}]")
+           end
+         end
+         hydra.queue re
+         re
+       end
+       hydra.run
+       @urls = []
+       wait_for_url
+     end
+
+     # Gracefully terminate this downloader
+     def graceful_terminate
+       @stopping = true
+       while @status == :running
+         sleep(1)
+       end
+     end
+
+     private
+
+     # Wait for new URLs to be added to the queue, using exponential backoff
+     def wait_for_url
+       @status = :waiting
+       if @stopping # check for stop flag
+         return
+       end
+       sleep @wait_time
+       get_new_url
+       if @urls.size == 0
+         if @wait_time < 30
+           @wait_time = @wait_time * 2
+         end
+         wait_for_url
+       else
+         @wait_time = 1
+         run
+       end
+     end
+
+     def get_new_url
+       @urls = @queue.find_unvisited
+     end
+   end
+ end
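A minimal sketch of driving the downloader as a background Celluloid actor, mirroring bin/news_crawler; the seed URL is a placeholder and the storage engine is assumed to have been selected already, as the executable does:

    require 'news_crawler/downloader'

    NewsCrawler::Storage::URLQueue.add('http://example.com')  # hypothetical seed URL
    dwl = NewsCrawler::Downloader.new(false)                  # don't start fetching yet
    dwl.async.run                                             # run inside the actor's thread

    # ... let it crawl for a while ...

    dwl.graceful_terminate   # wait until the current batch finishes
    dwl.terminate            # shut the Celluloid actor down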
@@ -0,0 +1,172 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'celluloid'
+ require 'nokogiri'
+ require 'simple_config'
+
+ require 'news_crawler/storage/raw_data'
+ require 'news_crawler/url_helper'
+ require 'news_crawler/crawler_module'
+ require 'news_crawler/nc_logger'
+
+ module NewsCrawler
+   module LinkSelector
+     # Select all links from the same domain.
+     # The domain is taken from the database.
+     class SameDomainSelector
+       include NewsCrawler::URLHelper
+       extend NewsCrawler::URLHelper
+
+       include NewsCrawler::CrawlerModule
+       include Celluloid
+
+       # Create a new selector backed by the URL queue.
+       # Selected URLs are put back into the queue.
+       # @param [ Fixnum ] max_depth maximum depth to crawl
+       # @param [ Boolean ] start_on_create whether to start the selector immediately
+       def initialize(max_depth = -1, start_on_create = true)
+         @max_depth = max_depth
+         @wait_time = 1
+         @status = :running
+         @stopping = false
+         run if start_on_create
+       end
+
+       # Extract URLs from the page fetched for the given URL
+       def extract_url(url)
+         doc = RawData.find_by_url(url)
+         html_doc = Nokogiri::HTML(doc)
+         results = []
+
+         inner_url = html_doc.xpath('//a').collect { | a_el |
+           temp_url = (a_el.attribute 'href').to_s
+           if (!temp_url.nil?) && (temp_url[0] == '/')
+             temp_url = url + temp_url
+           end
+           temp_url
+         }
+
+         inner_url.delete_if { | a_url |
+           (a_url.nil?) || (a_url.size == 0) || (a_url == '#')
+         }
+
+         # select urls from the same domain
+         inner_url.select { | o_url |
+           if (same_domain?(o_url, url))
+             if (!SameDomainSelector.exclude?(o_url))
+               begin
+                 URLQueue.add(o_url, url)
+                 results << [o_url, url]
+               rescue URLQueue::DuplicateURLError => e
+               end
+             else
+               # TODO Log here
+             end
+           end
+         }
+       end
+
+       def run
+         @status = :running
+         return if @stopping
+         if @max_depth == 0
+           @status = :stopped
+           return
+         end
+         while !@stopping
+           url = next_unprocessed(@max_depth - 1)
+           while (url.nil?)
+             wait_for_url
+             url = next_unprocessed(@max_depth - 1)
+           end
+           NCLogger.get_logger.info "Processing #{url}"
+           extract_url(url)
+           mark_processed(url)
+         end
+       end
+
+       # Test whether a url is excluded
+       # @param [ String ] url
+       # @return [ Boolean ] true if url is excluded, false otherwise
+       def self.exclude?(url)
+         config = SimpleConfig.for :same_domain_selector
+         exclude_list = []
+         url_domain = get_url_path(url)[:domain]
+         begin
+           exclude_group = config.exclude
+         rescue NoMethodError => e
+           return false
+         end
+
+         exclude_group.to_hash.keys.each do | url_e |
+           if url_domain.to_s.end_with? url_e.to_s
+             exclude_list = config.exclude.get(url_e)
+             break
+           end
+         end
+
+         exclude_list = exclude_list.map do | elt |
+           if /^\/.*\/$/ =~ elt
+             Regexp.new(elt[1..-2]) # written as /.../, already a Regexp source
+           else
+             new_elt = "^(.*/)?#{elt}(/.*)?$"
+             Regexp.new(new_elt)
+           end
+         end
+
+         if exclude_list.count == 0
+           return false
+         end
+
+         # url.split('/').each do | part |
+         #   if exclude_list.include? part
+         #     return true
+         #   end
+         # end
+         exclude_list.each do | exclude_rule |
+           if exclude_rule =~ url
+             return true
+           end
+         end
+         return false
+       end
+
+       # Gracefully terminate this selector
+       def graceful_terminate
+         @stopping = true
+         while @status == :running
+           sleep(1)
+         end
+       end
+
+       private
+
+       # Wait for new URLs to be added to the queue, using exponential backoff
+       def wait_for_url
+         @status = :waiting
+         sleep @wait_time
+         if @wait_time < 30
+           @wait_time = @wait_time * 2
+         end
+       end
+     end
+   end
+ end
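With a :same_domain_selector config like the one sketched earlier, the class-level exclusion check might behave roughly as follows (URLs are illustrative, and the exact domain parsing depends on URLHelper#get_url_path, which is not part of this diff):

    require 'news_crawler/link_selector/same_domain_selector'

    sds = NewsCrawler::LinkSelector::SameDomainSelector
    sds.exclude?('http://example.com/video/1234')  # => true, 'video' is an excluded segment
    sds.exclude?('http://example.com/news/1234')   # => false
    sds.exclude?('http://other.org/video/1234')    # => false, domain not in the exclude list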
@@ -0,0 +1,49 @@
+ #! /usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'logger'
+
+ module NewsCrawler
+   class NCLogger
+     # Get the shared logger
+     def self.get_logger
+       @logger ||= Logger.new(STDERR)
+       @logger.progname = 'news_crawler'
+       @logger
+     end
+
+     # Set the logger level
+     # @param [ Logger::Severity ] l level
+     def self.set_level(l)
+       get_logger.level = l
+     end
+
+     # Set the log device; it should be anything accepted by Ruby's Logger.new
+     # @param [ Object ] ld log device
+     def self.set_logdev(ld)
+       @logger = Logger.new(ld)
+       @logger.progname = 'news_crawler'
+     end
+   end
+ end
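A small usage sketch of the shared logger (the log file path is a placeholder):

    require 'news_crawler/nc_logger'

    NewsCrawler::NCLogger.set_level(Logger::INFO)            # default device is STDERR
    NewsCrawler::NCLogger.get_logger.info('starting crawl')  # progname is 'news_crawler'
    NewsCrawler::NCLogger.set_logdev('crawler.log')          # hypothetical log file path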
@@ -0,0 +1,77 @@
+ # -*- coding: utf-8 -*-
+ #--
+ # NewsCrawler - a website crawler
+ #
+ # Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+ #
+ # This file is part of NewsCrawler.
+ #
+ # NewsCrawler is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation, either version 3 of the License, or
+ # (at your option) any later version.
+ #
+ # NewsCrawler is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+ #++
+
+ require 'mongo'
+ require 'simple_config'
+ require 'news_crawler/storage/raw_data/raw_data_engine'
+
+
+ module NewsCrawler
+   module Storage
+     module RawData
+       # Raw data storage implemented with MongoDB
+       class MongoStorage < NewsCrawler::Storage::RawData::RawDataEngine
+         NAME = 'mongo'
+
+         include Mongo
+
+         def initialize(*opts)
+           config = (SimpleConfig.for :application)
+           client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+           db = client[config.mongodb.db_name]
+           @coll = db[config.prefix + '_' + config.suffix.raw_data]
+           @coll.ensure_index({:url => Mongo::ASCENDING}, {:unique => true})
+         end
+
+         # Add an entry to the raw data collection, overwriting old data
+         # @param [ String ] url
+         # @param [ String ] body
+         def add(url, body)
+           @coll.update({:url => url},
+                        {:$set => {:body => body}},
+                        {:upsert => true})
+         end
+
+         # Find the document with the corresponding URL
+         # @param [ String ] url
+         # @return [ String, nil ]
+         def find_by_url(url)
+           result = @coll.find_one({:url => url})
+           if (!result.nil?)
+             result['body']
+           else
+             nil
+           end
+         end
+
+         # Get the number of raw data entries
+         def count
+           @coll.count
+         end
+
+         # Remove all entries
+         def clear
+           @coll.remove
+         end
+       end
+     end
+   end
+ end
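A minimal sketch of the RawData facade that this engine backs, assuming the :mongo engine and a running MongoDB as configured in default_config.yml (the URL and body are placeholders):

    require 'news_crawler/storage/raw_data'

    NewsCrawler::Storage::RawData.set_engine(:mongo)                          # as bin/news_crawler does
    NewsCrawler::Storage::RawData.add('http://example.com', '<html></html>')  # upsert keyed by URL
    NewsCrawler::Storage::RawData.find_by_url('http://example.com')           # => '<html></html>'
    NewsCrawler::Storage::RawData.clear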