news_crawler 0.0.3 → 0.0.4.pre.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 50f3793b5c9e31eeaba0d7650ddf20d451de16c2
-  data.tar.gz: 5796887e83aa9912b02fa67f69bb1b1f5f34ecec
+  metadata.gz: 37f387711eea761bdcaa5cf16f30eb295b60652e
+  data.tar.gz: 127ab77d72f429a3bacf46a24367331eaadf471f
 SHA512:
-  metadata.gz: 7d97e8b6630aebad685d9c35b848f575eb293e691a3e42e1cb4c86bee1920df27893fdb51d075dad92dbd029a5eae25e26ee1a5ab8ab6f51a31078fbef479caf
-  data.tar.gz: 08486762b1261bd0a5b950e63084a216bfd2a710b15c54bceec1d604dcfe77088da2b9265f9a99093281009427d2eb7dd5766592513b54255f75dcf28c16bc97
+  metadata.gz: 278f80bc1f4eec78a2536ee74edebfe94b26b45a15c9577bedb7214325ad4cdeda6e64eaae002e7f5c48480fe4eaefb413e4e00bef518915de1fb8cfbb08321d
+  data.tar.gz: c57baf50222284a38769636c2cb4cc57e075bf8c008f32765c9dc92a2992afe6eb8d5eb32b6d5e04bf5695167c55be40c74eed4b981462d2a9a27959cc852c3b
lib/news_crawler/crawler_module.rb CHANGED
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 #--
 # NewsCrawler - a website crawler
 #
@@ -20,7 +21,8 @@
 #++
 
 require 'news_crawler/storage/url_queue'
-require 'thread'
+require 'news_crawler/storage/yaml_stor'
+
 
 module NewsCrawler
   # Include this to get basic module methods
@@ -66,5 +68,23 @@ module NewsCrawler
     def next_unprocessed(max_depth = -1)
       URLQueue.next_unprocessed(self.class.name, max_depth)
     end
+
+    def mark_all_as_unprocessed
+      URLQueue.mark_all(self.class.name, URLQueue::UNPROCESSED)
+    end
+
+    # Serialize object to YAML and save it (overwrite if key exists)
+    # @param [ String ] key
+    # @param [ Object ] value
+    def save_yaml(key, value)
+      YAMLStor.add(self.class.name, key, value)
+    end
+
+    # Load YAML object
+    # @param [ String ] key
+    # @return [ Object, nil ]
+    def load_yaml(key)
+      YAMLStor.get(self.class.name, key)
+    end
   end
 end
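The two YAML helpers above give every crawler module a simple per-module key/value store. Below is a minimal usage sketch; the WordSeen class and its logic are hypothetical, while next_unprocessed, mark_processed, save_yaml and load_yaml are CrawlerModule methods shown or referenced in this diff, and the sketch assumes the YAMLStor engine has already been configured (see yaml_stor.rb below).

    require 'news_crawler/crawler_module'

    module NewsCrawler
      # Hypothetical module illustrating the new save_yaml / load_yaml helpers
      class WordSeen
        include CrawlerModule

        def run
          while (url = next_unprocessed)
            save_yaml(url, { :seen_at => Time.now })  # keyed by URL, scoped to this class
            mark_processed(url)
          end
        end

        def seen_at(url)
          load_yaml(url)  # returns nil if nothing was saved for this key
        end
      end
    end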
lib/news_crawler/default_config.yml CHANGED
@@ -1,4 +1,4 @@
-db:
+:db:
   :engine: :mongo
 
   :mongodb:
@@ -9,5 +9,6 @@ db:
   :suffix:
     :raw_data: raw_data
     :url_queue: url_queue
+    :yaml: yaml
 
-  prefix: ''
+  :prefix: ''
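For reference, the new :yaml suffix feeds the collection naming used by the MongoDB-backed YAML store added later in this diff: the collection name is the configured prefix joined to the suffix with an underscore. A small sketch (the 'nc' prefix is hypothetical; the gem's default prefix is the empty string):

    require 'simple_config'

    config = SimpleConfig.for :application
    config.prefix + '_' + config.suffix.yaml  # => "nc_yaml" with prefix 'nc', "_yaml" with the default ''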
lib/news_crawler/processing/structure_analysis.rb ADDED
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'nokogiri'
+require 'uri'
+require 'set'
+require 'digest'
+
+require 'news_crawler/url_helper'
+require 'news_crawler/storage/url_queue'
+require 'news_crawler/storage/raw_data'
+require 'news_crawler/crawler_module'
+require 'news_crawler/nc_logger'
+
+module NewsCrawler
+  module Processing
+    # Analyse website structure to extract content.
+    # The database should only contain raw data from one website.
+    class StructureAnalysis
+      include CrawlerModule
+      include URLHelper
+
+      def initialize
+        @url_stats = {}
+        while (url = next_unprocessed)
+          NCLogger.get_logger.info "[NC::P::SA] Processing #{url}"
+          re = extract_content(url)
+          @url_stats[url] = re
+          save_yaml(url, re)
+        end
+      end
+
+      def extract_content(url)
+        html_doc = RawData.find_by_url(url)
+        result = {}
+        result[:type] = :article
+
+        # Remove tags that cause trouble for Nokogiri
+        html_doc = remove_tag(html_doc, 'script')
+        html_doc = remove_tag(html_doc, 'iframe')
+        html_doc = remove_tag(html_doc, 'style')
+
+        doc = Nokogiri::HTML.parse(html_doc)
+        longest = find_longest_node(doc)
+        lowest_ancestor, path_to_longest = find_lowest_ancestor_has_id(longest)
+
+        # Heuristic 1
+        # Longest content sits directly under an element with an id attribute
+        if path_to_longest.length == 2
+          return { :type => :list }
+        end
+
+        parent = path_to_longest[1..-1]
+        parent = parent.reverse
+        xpath_path = parent.join('/')
+        xpath_path = '//' + xpath_path + '//text()'
+
+        guest_type = classify_h2(longest, lowest_ancestor)
+        result = { :type => guest_type }
+
+        if (result[:type] == :article)
+          title_ = lowest_ancestor.css('h1')
+          if title_.count == 1
+            result[:title] = title_.to_a[0].content
+          else
+            # if we can't guess the title, assume it isn't an article
+            result[:type] = :list
+          end
+
+          main_content = ''
+          lowest_ancestor.xpath(xpath_path).each do | node |
+            main_content += node.content
+          end
+
+          result[:content] = main_content
+        end
+
+        mark_processed(url)
+        result
+      end
+
+      # Predict whether the tree rooted at this node is a fragment of an article or of an index page
+      # @param [ Nokogiri::XML::Node ] root
+      # @param [ Nokogiri::XML::Node ] limit limit node to search backward
+      # @return [ Symbol ] one of :article, :list
+      def classify_h2(root, limit)
+        current = root
+        current = current.parent if current.text?
+
+        depth = 0
+
+        while true
+          expect_hash = hash_node(current, 0)
+          previous = current
+          current = current.parent
+
+          depth += 1
+          lons = {}
+          node_count = 0
+          node_list = [previous]
+          current.children.each do | child |
+            hc = hash_node(child, depth - 1)
+            if hc == expect_hash
+              node_count += 1
+              node_list << child
+            end
+          end
+
+          if node_count > 1
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+            break
+          end
+
+          if current == limit
+            a_tag_len, non_a_tag_len = count_a_and_non_a_tag(current)
+            if non_a_tag_len > a_tag_len
+              return :article
+            else
+              return :list
+            end
+            break
+          end
+        end
+
+        return :list
+      end
+
+      # Count a tags and non-a text nodes in the tree pointed to by node
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ [Fixnum, Fixnum] ] a tag and non-a text node counts
+      def count_a_and_non_a_tag(node)
+        a_tag_list = node.xpath './/a'
+        a_tag_len = a_tag_list.count # number of a tags
+
+        non_a_tag_list = node.xpath './/text()[not (ancestor::a)]'
+        non_a_tag_len = non_a_tag_list.to_a.inject(0) do | memo, node |
+          if node.content.gsub(/\s+/, '').length > 15
+            memo + 1
+          else
+            memo
+          end
+        end
+        [ a_tag_len, non_a_tag_len ]
+      end
+
+      # Find the lowest ancestor of node that has an id attribute
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ Nokogiri::XML::Node ]
+      def find_lowest_ancestor_has_id(node)
+        found_id = false
+
+        closest_ancestor = node
+
+        path_to_closest = []
+
+        while (!found_id)
+          if closest_ancestor.has_attribute?('id')
+            path_to_closest << "#{closest_ancestor.node_name}[@id='#{closest_ancestor.attribute('id')}']"
+            found_id = true
+          else
+            if closest_ancestor.has_attribute?('class')
+              node_class = "@class = '#{closest_ancestor.attribute('class')}'"
+            else
+              node_class = 'not(@class)'
+            end
+            path_to_closest << "#{closest_ancestor.node_name}[#{node_class}]"
+            closest_ancestor = closest_ancestor.parent
+          end
+        end
+
+        return [ closest_ancestor, path_to_closest ]
+      end
+
+      # Find the longest text node that doesn't have an a element in its ancestor list
+      # @param [ Nokogiri::XML::Node ] doc
+      def find_longest_node(doc)
+        xpath_query = '//*[@id]//text()[not (ancestor::a)]'
+
+        a_l = doc.xpath xpath_query
+
+        longest = nil
+        longest_len = 0
+
+        a_l.each do | en |
+          node_content_wo_space = en.content.gsub(/\s/, '') # measure length ignoring whitespace
+          if node_content_wo_space.length > longest_len
+            longest_len = node_content_wo_space.length
+            longest = en
+          end
+        end
+
+        return longest
+      end
+
+      # Remove unwanted HTML tags
+      # @param [ String ] html_doc HTML document
+      # @param [ String ] tag tag to be removed
+      def remove_tag(html_doc, tag)
+        pattern = Regexp.new("<#{tag}.*?>.*?</#{tag}>", Regexp::MULTILINE)
+        html_doc.gsub(pattern, '')
+      end
+
+      # Return a String representing the node's name, id and class
+      # @param [ Nokogiri::XML::Node ] node
+      # @return [ String ]
+      def node_info(node)
+        node_pp = node.node_name
+        node_pp += '#' + node.attribute('id') if node.has_attribute?('id')
+        node_pp += '.' + node.attribute('class') if node.has_attribute?('class')
+        node_pp
+      end
+
+      # Calculate a hash of a node from its own and its children's info
+      # @param [ Nokogiri::XML::Node ] node
+      # @param [ Fixnum ] limit limit depth of children (-1 for unlimited)
+      # @return [ String ] hash of node in base64 encoding
+      def hash_node(node, limit = -1)
+        node_sign = node.node_name
+        node_sign += "##{node['id']}" unless node['id'].nil?
+        node_sign += ".#{node['class']}" unless node['class'].nil?
+
+        hash_sum = node_sign
+
+        if limit != 0
+          child_hash = Set.new
+          node.children.each do | child_node |
+            child_hash.add(hash_node(child_node, limit - 1))
+          end
+
+          child_hash.each do | ch |
+            hash_sum += ch
+          end
+        end
+
+        Digest::SHA2.new.base64digest(hash_sum)
+      end
+
+
+      # Get and analyse url for information
+      def analyse(url)
+        # puts "processing #{url}"
+        html_doc = RawData.find_by_url(url)
+        doc = Nokogiri.HTML(html_doc)
+        inner_url = doc.xpath('//a').collect { | a_el |
+          temp_url = (a_el.attribute 'href').to_s
+          if (!temp_url.nil?) && (temp_url[0] == '/')
+            temp_url = URI.join(url, temp_url).to_s
+          end
+          temp_url
+        }
+
+        inner_url.delete_if { | url_0 |
+          (url_0.nil?) || (url_0.size == 0) || (url_0 == '#') ||
+          (url_0 == 'javascript:;')
+        }
+
+        inner_url.each do | url |
+          @url_stats[url] = (@url_stats[url] || 0) + 1
+        end
+        mark_processed(url)
+      end
+
+      # Check whether the string is really a 'url'
+      # @param [ String ] url
+      # @return [ Boolean ]
+      def is_url?(url)
+        (url.size != 0) && (url != '#') && (url != 'javascript:;')
+      end
+
+      def get_result
+        @url_stats
+      end
+    end
+  end
+end
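StructureAnalysis does all of its work in the constructor: it pulls every unprocessed URL from the queue, classifies the page as :article or :list, and stores the result both in @url_stats and through save_yaml. A usage sketch, assuming the URL queue and raw data storage are already populated and the storage engines are configured:

    require 'news_crawler/processing/structure_analysis'

    analysis = NewsCrawler::Processing::StructureAnalysis.new  # processes the whole queue
    analysis.get_result.each do |url, info|
      # info is the hash built by extract_content: :type, plus :title/:content for articles
      puts "#{url} -> #{info[:type]}"
    end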
lib/news_crawler/storage/yaml_stor/mongo_storage.rb ADDED
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'mongo'
+require 'yaml'
+require 'simple_config'
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/nc_logger'
+
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # YAML storage implemented using MongoDB
+      class MongoStorage < NewsCrawler::Storage::YAMLStor::YAMLStorEngine
+        NAME = 'mongo'
+
+        include Mongo
+
+        def initialize(*opts)
+          config = (SimpleConfig.for :application)
+          client = MongoClient.new(config.mongodb.host, config.mongodb.port)
+          db = client[config.mongodb.db_name]
+          @coll = db[config.prefix + '_' + config.suffix.yaml]
+          # @coll.ensure_index({:key => Mongo::ASCENDING}, {:unique => true})
+        end
+
+        # Add an entry to the yaml collection, overwriting old data
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to be serialized to YAML
+        def add(module_name, key, value)
+          yaml_str = value.to_yaml
+          yaml_str.encode!('utf-8', :invalid => :replace, :undef => :replace)
+          @coll.update({:key => key,
+                        :m_name => module_name},
+                       {:$set => {:value => yaml_str}},
+                       {:upsert => true})
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          result = @coll.find_one({:key => key,
+                                   :m_name => module_name})
+          if (!result.nil?)
+            YAML.load(result['value'])
+          else
+            nil
+          end
+        end
+
+        # Get the number of stored YAML entries
+        def count
+          @coll.count
+        end
+
+        def clear
+          @coll.remove
+        end
+      end
+    end
+  end
+end
lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb ADDED
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+module NewsCrawler
+  module Storage
+    module YAMLStor
+      # Base class for YAMLStor engines.
+      # Subclass and implement all of its methods to create a new YAMLStor engine;
+      # you should keep the methods' signatures unchanged
+      class YAMLStorEngine
+        def self.inherited(klass)
+          @engine_list = (@engine_list || []) + [klass]
+        end
+
+        # Get engine list
+        # @return [ Hash ] map of engine names to YAML storage engine classes
+        def self.get_engines
+          @engine_list = @engine_list || []
+          @engine_list.inject({}) do | memo, klass |
+            memo[klass::NAME.intern] = klass
+            memo
+          end
+        end
+
+        # Add an entry to the YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value
+        def add(module_name, key, value)
+          raise NotImplementedError
+        end
+
+        # Get an entry from the YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ] value, or nil if key isn't found
+        def get(module_name, key)
+          raise NotImplementedError
+        end
+
+        def count
+          raise NotImplementedError
+        end
+
+        def clear
+          raise NotImplementedError
+        end
+      end
+    end
+  end
+end
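Because YAMLStorEngine.inherited registers every subclass, adding a new backend only requires a NAME constant and the four methods above. A hypothetical in-memory engine (for tests; not part of the gem) could look like the sketch below, and would then be selectable with set_engine(:memory):

    require 'yaml'
    require 'news_crawler/storage/yaml_stor/yaml_stor_engine'

    module NewsCrawler
      module Storage
        module YAMLStor
          # Hypothetical in-memory engine, registered automatically via YAMLStorEngine.inherited
          class MemoryStorage < YAMLStorEngine
            NAME = 'memory'

            def initialize(*opts)
              @data = {}
            end

            def add(module_name, key, value)
              @data[[module_name, key]] = value.to_yaml
            end

            def get(module_name, key)
              yaml = @data[[module_name, key]]
              yaml && YAML.load(yaml)
            end

            def count
              @data.size
            end

            def clear
              @data.clear
            end
          end
        end
      end
    end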
lib/news_crawler/storage/yaml_stor.rb ADDED
@@ -0,0 +1,78 @@
+# -*- coding: utf-8 -*-
+#--
+# NewsCrawler - a website crawler
+#
+# Copyright (C) 2013 - Hà Quang Dương <contact@haqduong.net>
+#
+# This file is part of NewsCrawler.
+#
+# NewsCrawler is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# NewsCrawler is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with NewsCrawler. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+require 'simpleconfig'
+
+#!!!
+require 'news_crawler/storage/yaml_stor/yaml_stor_engine'
+require 'news_crawler/storage/yaml_stor/mongo_storage'
+
+module NewsCrawler
+  module Storage
+    # YAML data storage.
+    # You can use it to store processed data or configuration
+    module YAMLStor
+      class << self
+        # Set YAMLStor storage engine
+        # @param [ Symbol, Object ] engine specify database engine, pass an object for a custom engine.
+        #   This can be
+        #   * `:mongo` for the MongoDB backend
+        # @param [ Hash ] opts options passed to the engine
+        def set_engine(engine, *opts)
+          if engine.respond_to? :intern
+            engine = engine.intern
+          end
+          engine_class = YAMLStorEngine.get_engines[engine]
+          if engine_class
+            @engine = engine_class.new(*opts)
+          else
+            @engine = engine
+          end
+        end
+
+        # Add an entry to YAML storage
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @param [ Object ] value object to serialize
+        def add(module_name, key, value)
+          @engine.add(module_name, key, value)
+        end
+
+        # Find the document with the corresponding key
+        # @param [ String ] module_name
+        # @param [ String ] key
+        # @return [ Object, nil ]
+        def get(module_name, key)
+          @engine.get(module_name, key)
+        end
+
+        def count
+          @engine.count
+        end
+
+        def clear
+          @engine.clear
+        end
+      end
+    end
+  end
+end
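Putting the pieces together, the facade selects a registered engine by name and then delegates add/get to it. A sketch of typical use, assuming a SimpleConfig :application config with the mongodb, prefix and suffix settings from default_config.yml; 'MyModule' and 'stats' are arbitrary example values:

    require 'news_crawler/storage/yaml_stor'

    NewsCrawler::Storage::YAMLStor.set_engine(:mongo)   # MongoStorage, NAME = 'mongo'
    NewsCrawler::Storage::YAMLStor.add('MyModule', 'stats', { :pages => 12 })
    NewsCrawler::Storage::YAMLStor.get('MyModule', 'stats')  # => { :pages => 12 }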
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: news_crawler
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4.pre.1
 platform: ruby
 authors:
 - Hà Quang Dương
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-28 00:00:00.000000000 Z
+date: 2013-08-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: mongo
@@ -164,6 +164,7 @@ files:
 - lib/news_crawler/downloader.rb
 - lib/news_crawler/link_selector/same_domain_selector.rb
 - lib/news_crawler/nc_logger.rb
+- lib/news_crawler/processing/structure_analysis.rb
 - lib/news_crawler/storage/raw_data.rb
 - lib/news_crawler/storage/raw_data/mongo_storage.rb
 - lib/news_crawler/storage/raw_data/raw_data_engine.rb
@@ -171,6 +172,9 @@ files:
 - lib/news_crawler/storage/url_queue/mongo_storage.rb
 - lib/news_crawler/storage/url_queue/url_queue_engine.rb
 - lib/news_crawler/storage/url_queue/url_queue_error.rb
+- lib/news_crawler/storage/yaml_stor.rb
+- lib/news_crawler/storage/yaml_stor/mongo_storage.rb
+- lib/news_crawler/storage/yaml_stor/yaml_stor_engine.rb
 - lib/news_crawler/url_helper.rb
 - lib/news_crawler/utils/robots_patch.rb
 - lib/news_crawler/default_config.yml
@@ -191,9 +195,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: 2.0.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - '>'
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.0.3