news_crawler 0.0.3 → 0.0.4.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/news_crawler/crawler_module.rb +21 -1
- data/lib/news_crawler/default_config.yml +3 -2
- data/lib/news_crawler/processing/structure_analysis.rb +299 -0
- data/lib/news_crawler/storage/yaml_stor/mongo_storage.rb +85 -0
- data/lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb +70 -0
- data/lib/news_crawler/storage/yaml_stor.rb +78 -0
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37f387711eea761bdcaa5cf16f30eb295b60652e
|
4
|
+
data.tar.gz: 127ab77d72f429a3bacf46a24367331eaadf471f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 278f80bc1f4eec78a2536ee74edebfe94b26b45a15c9577bedb7214325ad4cdeda6e64eaae002e7f5c48480fe4eaefb413e4e00bef518915de1fb8cfbb08321d
|
7
|
+
data.tar.gz: c57baf50222284a38769636c2cb4cc57e075bf8c008f32765c9dc92a2992afe6eb8d5eb32b6d5e04bf5695167c55be40c74eed4b981462d2a9a27959cc852c3b
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#--
|
2
3
|
# NewsCrawler - a website crawler
|
3
4
|
#
|
@@ -20,7 +21,8 @@
|
|
20
21
|
#++
|
21
22
|
|
22
23
|
require 'news_crawler/storage/url_queue'
|
23
|
-
require '
|
24
|
+
require 'news_crawler/storage/yaml_stor'
|
25
|
+
|
24
26
|
|
25
27
|
module NewsCrawler
|
26
28
|
# Include this to get basic module methods
|
@@ -66,5 +68,23 @@ module NewsCrawler
|
|
66
68
|
# Ask the URL queue for the next URL this module has not processed yet.
# The lookup is made under the name of the including class.
# @param [ Fixnum ] max_depth depth limit handed through to URLQueue
# @return the value returned by URLQueue.next_unprocessed
def next_unprocessed(max_depth = -1)
  module_name = self.class.name
  URLQueue.next_unprocessed(module_name, max_depth)
end
|
71
|
+
|
72
|
+
# Ask the URL queue to flag every entry of this module with
# URLQueue::UNPROCESSED.
def mark_all_as_unprocessed
  module_name = self.class.name
  URLQueue.mark_all(module_name, URLQueue::UNPROCESSED)
end
|
75
|
+
|
76
|
+
# Store an object in the YAML storage under this module's name.
# A value already saved under the same key is overwritten.
# @param [ String ] key
# @param [ Object ] value object to serialize
def save_yaml(key, value)
  module_name = self.class.name
  YAMLStor.add(module_name, key, value)
end
|
82
|
+
|
83
|
+
# Load an object previously stored under this module's name
# @param [ String ] key
# @param [ Object ] value ignored; accepted only so that existing
#   two-argument calls keep working
# @return [ Object, nil ]
def load_yaml(key, value = nil)
  # YAMLStor.get takes (module_name, key). Forwarding +value+ as a third
  # argument made every call fail with ArgumentError.
  YAMLStor.get(self.class.name, key)
end
|
69
89
|
end
|
70
90
|
end
|
@@ -0,0 +1,299 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# NewsCrawler - a website crawler
|
4
|
+
#
|
5
|
+
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
|
6
|
+
#
|
7
|
+
# This file is part of NewsCrawler.
|
8
|
+
#
|
9
|
+
# NewsCrawler is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NewsCrawler is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
require 'digest'
require 'set'
require 'uri'

require 'nokogiri'

require 'news_crawler/url_helper'
require 'news_crawler/storage/url_queue'
require 'news_crawler/storage/raw_data'
require 'news_crawler/crawler_module'
require 'news_crawler/nc_logger'
|
31
|
+
|
32
|
+
module NewsCrawler
|
33
|
+
module Processing
|
34
|
+
# Analyse website structure to extract content
|
35
|
+
# Database should only contain raw data from one website.
|
36
|
+
class StructureAnalysis
|
37
|
+
include CrawlerModule
|
38
|
+
include URLHelper
|
39
|
+
|
40
|
+
# Process every URL the queue still reports as unprocessed: each one is
# logged, analysed with extract_content, remembered in @url_stats and
# persisted through save_yaml. Returns once next_unprocessed yields a
# falsy value.
def initialize
  @url_stats = {}
  while (url = next_unprocessed)
    NCLogger.get_logger.info("[NC::P::SA] Processing #{url}")
    outcome = extract_content(url)
    @url_stats.store(url, outcome)
    save_yaml(url, outcome)
  end
end
|
49
|
+
|
50
|
+
# Classify the raw document stored for +url+ and, for articles, extract
# its title and main text. The URL is marked as processed on every path
# that returns a result.
# @param [ String ] url
# @return [ Hash ] always has :type (:article or :list); :title is set
#   when exactly one h1 is found; :content is set whenever classify_h2
#   reported :article
def extract_content(url)
  html_doc = RawData.find_by_url(url)

  # Remove tags causing trouble to nokogiri
  %w(script iframe style).each do |tag|
    html_doc = remove_tag(html_doc, tag)
  end

  doc = Nokogiri::HTML.parse(html_doc)
  result = describe_longest_node(find_longest_node(doc))

  mark_processed(url)
  result
end

# Build the result hash for the longest text node found in a document.
# @param [ Nokogiri::XML::Node, nil ] longest
# @return [ Hash ]
def describe_longest_node(longest)
  # No candidate text at all: nothing to extract, treat the page as a list
  return { :type => :list } if longest.nil?

  lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)

  # Heuristic 1
  # The longest text sits directly inside the element carrying the id
  return { :type => :list } if path_to_longest.length == 2

  result = { :type => classify_h2(longest, lowest_ancestor) }
  return result unless result[:type] == :article

  headings = lowest_ancestor.css('h1')
  if headings.count == 1
    result[:title] = headings.first.content
  else
    # if the title can't be guessed then assume it isn't an article
    # (the content below is still recorded, as before)
    result[:type] = :list
  end

  # Drop the text node itself, then walk from the id element down to the
  # longest node's parent.
  xpath_path = '//' + path_to_longest[1..-1].reverse.join('/') + '//text()'
  result[:content] = lowest_ancestor.xpath(xpath_path).map { |node| node.content }.join
  result
end
private :describe_longest_node
|
98
|
+
|
99
|
+
# Predict whether the tree around root is a fragment of an article or of
# an index page. Starting at root (or its parent when root is a text
# node) the method climbs one level at a time. It decides at the first
# ancestor that either has more than one child whose hash equals the hash
# of the node just left, or is the limit node.
# @param [ Nokogiri::XML::Node ] root
# @param [ Nokogiri::XML::Node ] limit limit node to search backward
# @return [ Symbol ] one of :article, :list
def classify_h2(root, limit)
  current = root
  current = current.parent if current.text?

  depth = 0

  until current.nil?
    # NOTE(review): the reference hash is taken at depth 0 while siblings
    # are hashed at depth - 1, so above the first level only childless
    # siblings can match - confirm this is intended.
    expect_hash = hash_node(current, 0)
    current = current.parent
    break if current.nil? # walked past the top without meeting limit

    depth += 1
    node_count = current.children.count do |child|
      hash_node(child, depth - 1) == expect_hash
    end

    if node_count > 1 || current == limit
      a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
      return non_a_tag_len > a_tag_len ? :article : :list
    end
  end

  :list
end
|
149
|
+
|
150
|
+
# Count a tags and substantial non-a text in the tree pointed to by node
# @param [ Nokogiri::XML::Node ] node
# @return [ [Fixnum, Fixnum] ] number of a elements, and number of text
#   nodes outside any a element that hold more than 15 non-whitespace
#   characters
def count_a_and_non_a_tag(node)
  a_tag_len = node.xpath('.//a').count

  non_a_tag_len = node.xpath('.//text()[not (ancestor::a)]').count do |text_node|
    text_node.content.gsub(/\s+/, '').length > 15
  end

  [a_tag_len, non_a_tag_len]
end
|
167
|
+
|
168
|
+
# Find the lowest node (the node itself or an ancestor) that has an id
# attribute, recording an XPath step for every node visited on the way.
# @param [ Nokogiri::XML::Node ] node
# @return [ [Nokogiri::XML::Node, Array] ] the node carrying the id and
#   the list of XPath steps, ordered from the starting node up to the id
#   node; the first element is nil when the walk runs out of parents
#   without finding an id
def find_lowest_ancestor_has_id(node)
  closest_ancestor = node
  path_to_closest = []

  until closest_ancestor.nil?
    if closest_ancestor.has_attribute?('id')
      path_to_closest << "#{closest_ancestor.node_name}[@id='#{closest_ancestor.attribute('id')}']"
      break
    end

    node_class = if closest_ancestor.has_attribute?('class')
                   "@class = '#{closest_ancestor.attribute('class')}'"
                 else
                   'not(@class)'
                 end
    path_to_closest << "#{closest_ancestor.node_name}[#{node_class}]"
    closest_ancestor = closest_ancestor.parent
  end

  [closest_ancestor, path_to_closest]
end
|
195
|
+
|
196
|
+
# Pick, among the text nodes that live under an element with an id and
# outside any a element, the one with the most non-whitespace characters.
# The first node wins a tie; nil is returned when no node has any
# non-whitespace character.
# @param [ Nokogiri::XML::Node ] doc
# @return [ Nokogiri::XML::Node, nil ]
def find_longest_node(doc)
  candidates = doc.xpath('//*[@id]//text()[not (ancestor::a)]')

  winner, _winner_len = candidates.inject([nil, 0]) do |(best, best_len), text_node|
    # length is measured with whitespace removed
    len = text_node.content.gsub(/\s/, '').length
    len > best_len ? [text_node, len] : [best, best_len]
  end

  winner
end
|
216
|
+
|
217
|
+
# Remove every occurrence of an HTML element (opening tag, content and
# closing tag) from a document. Matching ignores case and spans lines.
# @param [ String ] html_doc HTML document
# @param [ String ] tag name of the tag to be removed
# @return [ String ] copy of html_doc without the matched elements
def remove_tag(html_doc, tag)
  name = Regexp.escape(tag)
  # \b keeps 'b' from matching <body>; IGNORECASE catches <SCRIPT>.
  pattern = Regexp.new("<#{name}\\b[^>]*>.*?</#{name}\\s*>",
                       Regexp::MULTILINE | Regexp::IGNORECASE)
  html_doc.gsub(pattern, '')
end
|
224
|
+
|
225
|
+
# Describe a node as its name, followed by "#<id>" when it has an id
# attribute and ".<class>" when it has a class attribute.
# @param [ Nokogiri::XML::Node ] node
# @return [ String ]
def node_info(node)
  id_part = node.has_attribute?('id') ? "##{node.attribute('id')}" : ''
  class_part = node.has_attribute?('class') ? ".#{node.attribute('class')}" : ''
  "#{node.node_name}#{id_part}#{class_part}"
end
|
234
|
+
|
235
|
+
# Calculate the hash of a node from its own signature (name, id, class)
# and the hashes of its children. Identical child hashes are counted
# once, in order of first appearance.
# @param [ Nokogiri::XML::Node ] node
# @param [ Fixnum ] limit limit depth of children (-1 for unlimited,
#   0 to hash the node's own signature only)
# @return [ String ] SHA2 digest of the node, base64 encoded
def hash_node(node, limit = -1)
  signature = node.node_name.dup
  signature << "##{node['id']}" unless node['id'].nil?
  signature << ".#{node['class']}" unless node['class'].nil?

  unless limit.zero?
    child_hashes = node.children.map { |child_node| hash_node(child_node, limit - 1) }
    signature << child_hashes.uniq.join
  end

  Digest::SHA2.new.base64digest(signature)
end
|
261
|
+
|
262
|
+
|
263
|
+
# Count the links found in the raw document stored for url.
# Every usable href increments its counter in @url_stats; hrefs starting
# with '/' are first resolved against url. The url is then marked as
# processed.
# NOTE(review): initialize stores extract_content results in the same
# @url_stats hash - confirm the two uses are never mixed on one instance.
# @param [ String ] url
# @return the value returned by mark_processed
def analyse(url)
  html_doc = RawData.find_by_url(url)
  doc = Nokogiri.HTML(html_doc)

  inner_url = doc.xpath('//a').map do |a_el|
    # to_s turns a missing href into '', so no nil check is needed
    href = a_el.attribute('href').to_s
    href[0] == '/' ? URI.join(url, href).to_s : href
  end

  inner_url.select { |link| is_url?(link) }.each do |link|
    @url_stats[link] = (@url_stats[link] || 0) + 1
  end

  mark_processed(url)
end
|
286
|
+
|
287
|
+
# Check whether a link target is a usable url: not nil, not empty, not a
# bare '#' anchor and not the 'javascript:;' placeholder.
# @param [ String, nil ] url
# @return [ Boolean ]
def is_url?(url)
  return false if url.nil?

  !url.empty? && url != '#' && url != 'javascript:;'
end
|
293
|
+
|
294
|
+
# Give access to the data collected in @url_stats (filled by initialize
# and by analyse).
# @return [ Hash ] the collected data, keyed by url
def get_result
  collected = @url_stats
  collected
end
|
297
|
+
end
|
298
|
+
end
|
299
|
+
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# NewsCrawler - a website crawler
|
4
|
+
#
|
5
|
+
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
|
6
|
+
#
|
7
|
+
# This file is part of NewsCrawler.
|
8
|
+
#
|
9
|
+
# NewsCrawler is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NewsCrawler is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
require 'mongo'
|
24
|
+
require 'yaml'
|
25
|
+
require 'simple_config'
|
26
|
+
require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
|
27
|
+
require 'news_crawler/nc_logger'
|
28
|
+
|
29
|
+
|
30
|
+
module NewsCrawler
|
31
|
+
module Storage
|
32
|
+
module YAMLStor
|
33
|
+
# YAML storage implemented on top of MongoDB.
# Each document holds the owning module name (:m_name), the key (:key)
# and the YAML-serialized value (:value).
class MongoStorage < NewsCrawler::Storage::YAMLStor::YAMLStorEngine
  NAME = 'mongo'

  include Mongo

  # Connect to the collection named by the application configuration
  # (mongodb host, port and db_name; collection is prefix + '_' + suffix.yaml)
  # @param [ Array ] opts accepted for engine compatibility, currently unused
  def initialize(*opts)
    config = SimpleConfig.for(:application)
    client = MongoClient.new(config.mongodb.host, config.mongodb.port)
    db = client[config.mongodb.db_name]
    @coll = db[config.prefix + '_' + config.suffix.yaml]
    # @coll.ensure_index({:key => Mongo::ASCENDING}, {:unique => true})
  end

  # Add entry to yaml collection, overwrite old data
  # @param [ String ] module_name
  # @param [ String ] key
  # @param [ Object ] value object to store; it is serialized with to_yaml
  def add(module_name, key, value)
    yaml_str = value.to_yaml
    # NOTE(review): when the string is already UTF-8 this transcode may be
    # a no-op on older Rubies - confirm invalid bytes are really replaced.
    yaml_str.encode!('utf-8', :invalid => :replace, :undef => :replace)
    @coll.update({ :key => key, :m_name => module_name },
                 { :$set => { :value => yaml_str } },
                 { :upsert => true })
  end

  # Find the document with the corresponding key and deserialize its value
  # @param [ String ] module_name
  # @param [ String ] key
  # @return [ Object, nil ] nil when no document matches
  def get(module_name, key)
    result = @coll.find_one({ :key => key, :m_name => module_name })
    return nil if result.nil?

    # NOTE(review): YAML.load can instantiate arbitrary objects; this is
    # only safe while the collection holds nothing but data written by #add.
    YAML.load(result['value'])
  end

  # Get number of entries in the yaml collection
  def count
    @coll.count
  end

  # Remove every entry from the yaml collection
  def clear
    @coll.remove
  end
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# NewsCrawler - a website crawler
|
4
|
+
#
|
5
|
+
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
|
6
|
+
#
|
7
|
+
# This file is part of NewsCrawler.
|
8
|
+
#
|
9
|
+
# NewsCrawler is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NewsCrawler is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
module NewsCrawler
|
24
|
+
module Storage
|
25
|
+
module YAMLStor
|
26
|
+
# Base class for YAMLStor engines.
# Subclass it and implement all of its instance methods to create a new
# YAMLStor engine; keep the methods' signatures unchanged. A subclass is
# listed by get_engines under the name given by its own NAME constant.
class YAMLStorEngine
  # Record every descendant in the registry kept on YAMLStorEngine itself,
  # so that subclasses of subclasses are registered as well.
  def self.inherited(klass)
    super
    YAMLStorEngine.class_eval { (@engine_list ||= []) << klass }
  end

  # Get engine list
  # @return [ Hash ] registered engine classes keyed by NAME as a Symbol;
  #   classes that do not define their own NAME constant are left out
  def self.get_engines
    registered = YAMLStorEngine.class_eval { @engine_list ||= [] }
    registered.each_with_object({}) do |klass, engines|
      # NAME is usually assigned after inherited has run, so it is read here
      next unless klass.const_defined?(:NAME, false)

      engines[klass::NAME.intern] = klass
    end
  end

  # Add entry to the YAML storage
  # @param [ String ] module_name
  # @param [ String ] key
  # @param [ Object ] value
  def add(module_name, key, value)
    raise NotImplementedError
  end

  # Get entry from the YAML storage
  # @param [ String ] module_name
  # @param [ String ] key
  # @return [ Object, nil ] Value or nil if key isn't found
  def get(module_name, key)
    raise NotImplementedError
  end

  # Get number of stored entries
  def count
    raise NotImplementedError
  end

  # Remove all stored entries
  def clear
    raise NotImplementedError
  end
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#--
|
3
|
+
# NewsCrawler - a website crawler
|
4
|
+
#
|
5
|
+
# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
|
6
|
+
#
|
7
|
+
# This file is part of NewsCrawler.
|
8
|
+
#
|
9
|
+
# NewsCrawler is free software: you can redistribute it and/or modify
|
10
|
+
# it under the terms of the GNU General Public License as published by
|
11
|
+
# the Free Software Foundation, either version 3 of the License, or
|
12
|
+
# (at your option) any later version.
|
13
|
+
#
|
14
|
+
# NewsCrawler is distributed in the hope that it will be useful,
|
15
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
17
|
+
# GNU General Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License
|
20
|
+
# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
|
21
|
+
#++
|
22
|
+
|
23
|
+
require 'simpleconfig'
|
24
|
+
|
25
|
+
#!!!
|
26
|
+
require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
|
27
|
+
require 'news_crawler/storage/yaml_stor/mongo_storage'
|
28
|
+
|
29
|
+
module NewsCrawler
|
30
|
+
module Storage
|
31
|
+
# YAML data storage
# You can use it to store processed data or configuration.
# An engine must be selected with set_engine before add, get, count or
# clear are called.
module YAMLStor
  class << self
    # Set YAMLStor storage engine
    # @param [ Symbol, String, Object ] engine name of a registered engine
    #   (e.g. `:mongo` for the MongoDB backend), or an engine object to
    #   use as is
    # @param [ Array ] opts options passed to the engine's constructor
    # @raise [ ArgumentError ] when a name matches no registered engine
    # @return [ Object ] the engine now in use
    def set_engine(engine, *opts)
      engine = engine.intern if engine.respond_to?(:intern)

      engine_class = YAMLStorEngine.get_engines[engine]
      if engine_class
        @engine = engine_class.new(*opts)
      elsif engine.is_a?(Symbol)
        # A bare name cannot serve as an engine; fail here rather than
        # with NoMethodError on the first add/get.
        raise ArgumentError, "unknown YAMLStor engine: #{engine.inspect}"
      else
        @engine = engine
      end
    end

    # Add entry to YAML storage
    # @param [ String ] module_name
    # @param [ String ] key
    # @param [ Object ] value object to serialize
    def add(module_name, key, value)
      @engine.add(module_name, key, value)
    end

    # Find the entry with the corresponding key
    # @param [ String ] module_name
    # @param [ String ] key
    # @return [ Object, nil ]
    def get(module_name, key)
      @engine.get(module_name, key)
    end

    # Get number of stored entries
    def count
      @engine.count
    end

    # Remove all stored entries
    def clear
      @engine.clear
    end
  end
end
|
77
|
+
end
|
78
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|
@@ -164,6 +164,7 @@ files:
|
|
164
164
|
- lib/news_crawler/downloader.rb
|
165
165
|
- lib/news_crawler/link_selector/same_domain_selector.rb
|
166
166
|
- lib/news_crawler/nc_logger.rb
|
167
|
+
- lib/news_crawler/processing/structure_analysis.rb
|
167
168
|
- lib/news_crawler/storage/raw_data.rb
|
168
169
|
- lib/news_crawler/storage/raw_data/mongo_storage.rb
|
169
170
|
- lib/news_crawler/storage/raw_data/raw_data_engine.rb
|
@@ -171,6 +172,9 @@ files:
|
|
171
172
|
- lib/news_crawler/storage/url_queue/mongo_storage.rb
|
172
173
|
- lib/news_crawler/storage/url_queue/url_queue_engine.rb
|
173
174
|
- lib/news_crawler/storage/url_queue/url_queue_error.rb
|
175
|
+
- lib/news_crawler/storage/yaml_stor.rb
|
176
|
+
- lib/news_crawler/storage/yaml_stor/mongo_storage.rb
|
177
|
+
- lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
|
174
178
|
- lib/news_crawler/url_helper.rb
|
175
179
|
- lib/news_crawler/utils/robots_patch.rb
|
176
180
|
- lib/news_crawler/default_config.yml
|
@@ -191,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
191
195
|
version: 2.0.0
|
192
196
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
197
|
requirements:
|
194
|
-
- - '
|
198
|
+
- - '>'
|
195
199
|
- !ruby/object:Gem::Version
|
196
|
-
version:
|
200
|
+
version: 1.3.1
|
197
201
|
requirements: []
|
198
202
|
rubyforge_project:
|
199
203
|
rubygems_version: 2.0.3
|