news_crawler 0.0.3 → 0.0.4.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/news_crawler/crawler_module.rb +21 -1
- data/lib/news_crawler/default_config.yml +3 -2
- data/lib/news_crawler/processing/structure_analysis.rb +294 -0
- data/lib/news_crawler/storage/yaml_stor/mongo_storage.rb +85 -0
- data/lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb +70 -0
- data/lib/news_crawler/storage/yaml_stor.rb +78 -0
- metadata +8 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 37f387711eea761bdcaa5cf16f30eb295b60652e
+  data.tar.gz: 127ab77d72f429a3bacf46a24367331eaadf471f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 278f80bc1f4eec78a2536ee74edebfe94b26b45a15c9577bedb7214325ad4cdeda6e64eaae002e7f5c48480fe4eaefb413e4e00bef518915de1fb8cfbb08321d
+  data.tar.gz: c57baf50222284a38769636c2cb4cc57e075bf8c008f32765c9dc92a2992afe6eb8d5eb32b6d5e04bf5695167c55be40c74eed4b981462d2a9a27959cc852c3b
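Each release pins SHA1 and SHA512 digests for the two archives packed inside the .gem file (the old digest values are truncated in this view). As a minimal sketch of what these entries mean, a downloaded copy can be checked by hand; the unpacked file paths below are assumptions:

    # Recompute the digests of an unpacked .gem (a .gem is a tar archive
    # holding metadata.gz and data.tar.gz).
    require 'digest'

    %w[metadata.gz data.tar.gz].each do |name|
      puts "#{name}: #{Digest::SHA512.file(name).hexdigest}"
      # compare against the matching entry under SHA512: in checksums.yaml
    end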
data/lib/news_crawler/crawler_module.rb
CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -20,7 +21,8 @@
 #++
 
 require 'news_crawler/storage/url_queue'
-require '
+require 'news_crawler/storage/yaml_stor'
+
 
 module NewsCrawler
   # Include this to get basic module methods
@@ -66,5 +68,23 @@ module NewsCrawler
     def next_unprocessed(max_depth = -1)
       URLQueue.next_unprocessed(self.class.name, max_depth)
     end
+
+    def mark_all_as_unprocessed
+      URLQueue.mark_all(self.class.name, URLQueue::UNPROCESSED)
+    end
+
+    # Serialize an object to YAML and save it (overwrites the value if the key exists)
+    # @param [ String ] key
+    # @param [ Object ] value
+    def save_yaml(key, value)
+      YAMLStor.add(self.class.name, key, value)
+    end
+
+    # Load a YAML-serialized object
+    # @param [ String ] key
+    # @return [ Object, nil ]
+    def load_yaml(key)
+      YAMLStor.get(self.class.name, key)
+    end
   end
 end
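save_yaml and load_yaml namespace the YAML store by the including class's name, so each crawler module gets its own key space. A hypothetical module shows the intended flow (the WordCounter class and its logic are illustrative, not part of the gem):

    require 'news_crawler/crawler_module'
    require 'news_crawler/storage/raw_data'

    class WordCounter
      include NewsCrawler::CrawlerModule

      def run
        while (url = next_unprocessed)
          save_yaml(url, { :words => count_words(url) }) # keyed under "WordCounter"
          mark_processed(url)
        end
      end

      def count_words(url)
        page = NewsCrawler::Storage::RawData.find_by_url(url)
        page ? page.split.size : 0
      end
    end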
data/lib/news_crawler/processing/structure_analysis.rb
ADDED
@@ -0,0 +1,294 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'set'
+require 'digest'
+require 'nokogiri'
+require 'uri'
+
+require 'news_crawler/url_helper'
+require 'news_crawler/storage/url_queue'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  module Processing
+    # Analyses website structure to extract content.
+    # The database should only contain raw data from one website.
+    class StructureAnalysis
+      include CrawlerModule
+      include URLHelper
+
+      def initialize
+        @url_stats = {}
+        while (url = next_unprocessed)
+          NCLogger.get_logger.info "[NC::P::SA] Processing #{url}"
+          re = extract_content(url)
+          @url_stats[url] = re
+          save_yaml(url, re)
+        end
+      end
+
+      def extract_content(url)
+        html_doc = RawData.find_by_url(url)
+        result = {}
+        result[:type] = :article # default guess, refined below
+
+        # Remove tags that cause trouble for Nokogiri
+        html_doc = remove_tag(html_doc, 'script')
+        html_doc = remove_tag(html_doc, 'iframe')
+        html_doc = remove_tag(html_doc, 'style')
+
+        doc = Nokogiri::HTML.parse(html_doc)
+        longest = find_longest_node(doc)
+        lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)
+
+        # Heuristic 1: if the longest text node sits directly under the
+        # element carrying the id attribute, treat the page as a list
+        if path_to_longest.length == 2
+          return { :type => :list }
+        end
+
+        parent = path_to_longest[1..-1]
+        parent = parent.reverse
+        xpath_path = parent.join('/')
+        xpath_path = '//' + xpath_path + '//text()'
+
+        guessed_type = classify_h2(longest, lowest_ancestor)
+        result = { :type => guessed_type }
+
+        if (result[:type] == :article)
+          title_ = lowest_ancestor.css('h1')
+          if title_.count == 1
+            result[:title] = title_.to_a[0].content
+          else
+            # if we can't guess a title, assume it isn't an article
+            result[:type] = :list
+          end
+
+          main_content = ''
+          lowest_ancestor.xpath(xpath_path).each do | node |
+            main_content += node.content
+          end
+
+          result[:content] = main_content
+        end
+
+        mark_processed(url)
+        result
+      end
+
+      # Predict whether the tree rooted at root is a fragment of an article or of an index page
+      # @param [ Nokogiri::XML::Node ] root
+      # @param [ Nokogiri::XML::Node ] limit node at which the upward search stops
+      # @return [ Symbol ] one of :article, :list
+      def classify_h2(root, limit)
+        current = root
+        current = current.parent if current.text?
+
+        depth = 0
+
+        while true
+          expect_hash = hash_node(current, 0)
+          previous = current
+          current = current.parent
+
+          depth += 1
+          node_count = 0
+          node_list = [previous]
+          current.children.each do | child |
+            hc = hash_node(child, depth - 1)
+            if hc == expect_hash
+              node_count += 1
+              node_list << child
+            end
+          end
+
+          if node_count > 1
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+          end
+
+          if current == limit
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+          end
+        end
+
+        return :list
+      end
+
+      # Count <a> tags and long non-<a> text nodes in the tree rooted at node
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ [Fixnum, Fixnum] ] number of <a> tags and of long anchor-free text nodes
+      def count_a_and_non_a_tag(node)
+        a_tag_list = node.xpath './/a'
+        a_tag_len = a_tag_list.count # number of <a> tags
+
+        non_a_tag_list = node.xpath './/text()[not (ancestor::a)]'
+        non_a_tag_len = non_a_tag_list.to_a.inject(0) do | memo, text_node |
+          if text_node.content.gsub(/\s+/, '').length > 15
+            memo + 1
+          else
+            memo
+          end
+        end
+        [ a_tag_len, non_a_tag_len ]
+      end
+
+      # Find the lowest ancestor of node that has an id attribute
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ [Nokogiri::XML::Node, Array] ] the ancestor and the XPath steps leading to it
+      def find_lowest_ancestor_has_id(node)
+        found_id = false
+
+        closest_ancestor = node
+
+        path_to_closest = []
+
+        while (!found_id)
+          if closest_ancestor.has_attribute?('id')
+            path_to_closest << "#{closest_ancestor.node_name}[@id='#{closest_ancestor.attribute('id')}']"
+            found_id = true
+          else
+            if closest_ancestor.has_attribute?('class')
+              node_class = "@class = '#{closest_ancestor.attribute('class')}'"
+            else
+              node_class = 'not(@class)'
+            end
+            path_to_closest << "#{closest_ancestor.node_name}[#{node_class}]"
+            closest_ancestor = closest_ancestor.parent
+          end
+        end
+
+        return [ closest_ancestor, path_to_closest ]
+      end
+
+      # Find the longest text node that has no <a> in its ancestor list
+      # @param [ Nokogiri::XML::Node ] doc
+      def find_longest_node(doc)
+        xpath_query = '//*[@id]//text()[not (ancestor::a)]'
+
+        a_l = doc.xpath xpath_query
+
+        longest = nil
+        longest_len = 0
+
+        a_l.each do | en |
+          node_content_wo_space = en.content.gsub(/\s/, '') # compare lengths with whitespace stripped
+          if node_content_wo_space.length > longest_len
+            longest_len = node_content_wo_space.length
+            longest = en
+          end
+        end
+
+        return longest
+      end
+
+      # Remove an unwanted HTML tag from the document
+      # @param [ String ] html_doc HTML document
+      # @param [ String ] tag tag to be removed
+      def remove_tag(html_doc, tag)
+        pattern = Regexp.new("<#{tag}.*?>.*?</#{tag}>", Regexp::MULTILINE)
+        html_doc.gsub(pattern, '')
+      end
+
+      # Return a String representing the node's name, id and class
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ String ]
+      def node_info(node)
+        node_pp = node.node_name
+        node_pp += '#' + node['id'] if node.has_attribute?('id')
+        node_pp += '.' + node['class'] if node.has_attribute?('class')
+        node_pp
+      end
+
+      # Calculate the hash of a node from its own and its children's info
+      # @param [ Nokogiri::XML::Node ] node
+      # @param [ Fixnum ] limit limit depth of children (-1 for unlimited)
+      # @return [ String ] hash of the node, base64-encoded
+      def hash_node(node, limit = -1)
+        node_sign = node.node_name
+        node_sign += "##{node['id']}" unless node['id'].nil?
+        node_sign += ".#{node['class']}" unless node['class'].nil?
+
+        hash_sum = node_sign
+
+        if limit != 0
+          child_hash = Set.new
+          node.children.each do | child_node |
+            child_hash.add(hash_node(child_node, limit - 1))
+          end
+
+          child_hash.each do | ch |
+            hash_sum += ch
+          end
+        end
+
+        Digest::SHA2.new.base64digest(hash_sum)
+      end
+
+      # Fetch a page's raw data and tally its inner links
+      def analyse(url)
+        html_doc = RawData.find_by_url(url)
+        doc = Nokogiri.HTML(html_doc)
+        inner_url = doc.xpath('//a').collect { | a_el |
+          temp_url = (a_el.attribute 'href').to_s
+          if (!temp_url.nil?) && (temp_url[0] == '/')
+            temp_url = URI.join(url, temp_url).to_s
+          end
+          temp_url
+        }
+
+        inner_url.delete_if { | url_0 |
+          (url_0.nil?) || (url_0.size == 0) || (url_0 == '#') ||
+          (url_0 == 'javascript:;')
+        }
+
+        inner_url.each do | in_url |
+          @url_stats[in_url] = (@url_stats[in_url] || 0) + 1
+        end
+        mark_processed(url)
+      end
+
+      # Check whether the string is a followable URL
+      # @param [ String ] url
+      # @return [ Boolean ]
+      def is_url?(url)
+        (url.size != 0) && (url != '#') && (url != 'javascript:;')
+      end
+
+      def get_result
+        @url_stats
+      end
+    end
+  end
+end
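StructureAnalysis classifies each stored page as :article or :list from two signals: the longest anchor-free text node underneath an element with an id, and whether some ancestor has several structurally identical children (equal hash_node digests), which suggests a link list. A standalone toy, assuming nothing beyond Nokogiri, demonstrates the XPath at the heart of find_longest_node:

    require 'nokogiri'

    html = <<-HTML
      <div id="main">
        <a href="/x">a fairly long link caption here</a>
        <p>This paragraph body is the longest anchor-free text on the page.</p>
      </div>
    HTML

    doc = Nokogiri::HTML(html)
    texts = doc.xpath('//*[@id]//text()[not (ancestor::a)]')
    longest = texts.max_by { |t| t.content.gsub(/\s/, '').length }
    puts longest.content.strip
    # => This paragraph body is the longest anchor-free text on the page.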
data/lib/news_crawler/storage/yaml_stor/mongo_storage.rb
ADDED
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'mongo'
+require 'yaml'
+require 'simple_config'
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/nc_logger'
+
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # YAML storage implemented with MongoDB
+      class MongoStorage < NewsCrawler::Storage::YAMLStor::YAMLStorEngine
+        NAME = 'mongo'
+
+        include Mongo
+
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          @coll = db[config.prefix + '_' + config.suffix.yaml]
+          # @coll.ensure_index({:key => Mongo::ASCENDING}, {:unique => true})
+        end
+
+        # Add an entry to the YAML collection, overwriting old data
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to be serialized to YAML
+        def add(module_name, key, value)
+          yaml_str = value.to_yaml
+          yaml_str.encode!('utf-8', :invalid => :replace, :undef => :replace)
+          @coll.update({:key => key,
+                        :m_name => module_name},
+                       {:$set => {:value => yaml_str}},
+                       {:upsert => true})
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          result = @coll.find_one({:key => key,
+                                   :m_name => module_name})
+          if (!result.nil?)
+            YAML.load(result['value'])
+          else
+            nil
+          end
+        end
+
+        # Get the number of stored YAML entries
+        def count
+          @coll.count
+        end
+
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end
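add() is an upsert keyed on the (m_name, key) pair, so repeated saves for the same module and key overwrite one document instead of accumulating. A minimal sketch of that pattern against the 2013-era mongo 1.x driver API the gem targets; host, database and collection names are illustrative:

    require 'mongo'
    include Mongo

    coll = MongoClient.new('localhost', 27017)['nc_demo']['nc_yaml']
    coll.update({ :key => 'stats', :m_name => 'Demo' },
                { :$set => { :value => { :pages => 42 }.to_yaml } },
                { :upsert => true })   # inserts the first time, overwrites after
    puts coll.count                    # => 1, however often this runs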
data/lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
ADDED
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # Base class for YAMLStor engines.
+      # Subclass it and implement all of its methods to create a new YAMLStor engine;
+      # you should keep the method signatures unchanged.
+      class YAMLStorEngine
+        def self.inherited(klass)
+          @engine_list = (@engine_list || []) + [klass]
+        end
+
+        # Get the engine list
+        # @return [ Hash ] map from engine name to engine class
+        def self.get_engines
+          @engine_list = @engine_list || []
+          @engine_list.inject({}) do | memo, klass |
+            memo[klass::NAME.intern] = klass
+            memo
+          end
+        end
+
+        # Add an entry to the YAML collection
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value
+        def add(module_name, key, value)
+          raise NotImplementedError
+        end
+
+        # Get an entry from the YAML collection
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ] value, or nil if the key isn't found
+        def get(module_name, key)
+          raise NotImplementedError
+        end
+
+        def count
+          raise NotImplementedError
+        end
+
+        def clear
+          raise NotImplementedError
+        end
+      end
+    end
+  end
+end
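The inherited hook makes YAMLStorEngine subclasses register themselves, and get_engines folds the list into a name-to-class map keyed by each subclass's NAME constant. A hypothetical in-memory engine (not part of the gem) shows how little a new backend needs:

    require 'yaml'
    require 'news_crawler/storage/yaml_stor/yaml_stor_engine'

    module NewsCrawler
      module Storage
        module YAMLStor
          class MemoryStorage < YAMLStorEngine
            NAME = 'memory'

            def initialize(*opts)
              @data = {}
            end

            def add(module_name, key, value)
              @data[[module_name, key]] = value.to_yaml
            end

            def get(module_name, key)
              yaml = @data[[module_name, key]]
              yaml && YAML.load(yaml)
            end

            def count
              @data.size
            end

            def clear
              @data.clear
            end
          end
        end
      end
    end

    # YAMLStorEngine.get_engines now includes :memory => MemoryStorage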
data/lib/news_crawler/storage/yaml_stor.rb
ADDED
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simpleconfig'
+
+# NOTE: engine implementations must be required after the engine base class
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/storage/yaml_stor/mongo_storage'
+
+module NewsCrawler
+  module Storage
+    # YAML data storage.
+    # You can use it to store processed data or configuration.
+    module YAMLStor
+      class << self
+        # Set the YAMLStor storage engine
+        # @param [ Symbol, Object ] engine database engine; pass an object for a custom engine.
+        #   Symbol values can be:
+        #   * `:mongo` for the MongoDB backend
+        # @param [ Hash ] opts options passed to the engine
+        def set_engine(engine, *opts)
+          if engine.respond_to? :intern
+            engine = engine.intern
+          end
+          engine_class = YAMLStorEngine.get_engines[engine]
+          if engine_class
+            @engine = engine_class.new(*opts)
+          else
+            @engine = engine
+          end
+        end
+
+        # Add an entry to YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to serialize
+        def add(module_name, key, value)
+          @engine.add(module_name, key, value)
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          @engine.get(module_name, key)
+        end
+
+        def count
+          @engine.count
+        end
+
+        def clear
+          @engine.clear
+        end
+      end
+    end
+  end
+end
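Typical wiring, as a minimal sketch: select the engine once at startup, then read and write through the facade. This assumes MongoDB connection settings are already configured via SimpleConfig, as the gem's default_config.yml does:

    require 'news_crawler/storage/yaml_stor'

    NewsCrawler::Storage::YAMLStor.set_engine(:mongo)
    NewsCrawler::Storage::YAMLStor.add('Demo', 'stats', { :pages => 42 })
    NewsCrawler::Storage::YAMLStor.get('Demo', 'stats')   # => {:pages=>42}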
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4.pre.1
 platform: ruby
 authors:
 - Hà Quang Dương
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-08-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mongo
@@ -164,6 +164,7 @@ files:
 - lib/news_crawler/downloader.rb
 - lib/news_crawler/link_selector/same_domain_selector.rb
 - lib/news_crawler/nc_logger.rb
+- lib/news_crawler/processing/structure_analysis.rb
 - lib/news_crawler/storage/raw_data.rb
 - lib/news_crawler/storage/raw_data/mongo_storage.rb
 - lib/news_crawler/storage/raw_data/raw_data_engine.rb
@@ -171,6 +172,9 @@ files:
 - lib/news_crawler/storage/url_queue/mongo_storage.rb
 - lib/news_crawler/storage/url_queue/url_queue_engine.rb
 - lib/news_crawler/storage/url_queue/url_queue_error.rb
+- lib/news_crawler/storage/yaml_stor.rb
+- lib/news_crawler/storage/yaml_stor/mongo_storage.rb
+- lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
 - lib/news_crawler/url_helper.rb
 - lib/news_crawler/utils/robots_patch.rb
 - lib/news_crawler/default_config.yml
@@ -191,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '
+  - - '>'
   - !ruby/object:Gem::Version
-    version:
+    version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.0.3
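The requirement change at the end is standard RubyGems behaviour: a version string containing a letter ('0.0.4.pre.1') marks the gem as a prerelease, and RubyGems pins required_rubygems_version to '> 1.3.1' for prerelease gems. Prerelease versions also sort before the final release:

    require 'rubygems'

    v = Gem::Version.new('0.0.4.pre.1')
    v.prerelease?                  # => true
    Gem::Version.new('0.0.3') < v  # => true
    v < Gem::Version.new('0.0.4')  # => true: .pre.1 sorts before the final 0.0.4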