news_crawler 0.0.2 → 0.0.3.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
4
- data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
3
+ metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
4
+ data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
5
5
  SHA512:
6
- metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
7
- data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
6
+ metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
7
+ data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
@@ -22,6 +22,7 @@
22
22
 
23
23
  require 'celluloid'
24
24
  require 'nokogiri'
25
+ require 'uri'
25
26
 
26
27
  require 'news_crawler/storage/raw_data'
27
28
  require 'news_crawler/url_helper'
@@ -60,13 +61,14 @@ module NewsCrawler
60
61
  inner_url = html_doc.xpath('//a').collect { | a_el |
61
62
  temp_url = (a_el.attribute 'href').to_s
62
63
  if (!temp_url.nil?) && (temp_url[0] == '/')
63
- temp_url = url + temp_url
64
+ temp_url = URI.join(url, temp_url).to_s
64
65
  end
65
66
  temp_url
66
67
  }
67
68
 
68
69
  inner_url.delete_if { | url |
69
- (url.nil?) || (url.size == 0) || (url == '#')
70
+ (url.nil?) || (url.size == 0) || (url == '#') ||
71
+ (url == 'javascript:;')
70
72
  }
71
73
 
72
74
  # select url from same domain
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  #--
2
3
  # NewsCrawler - a website crawler
3
4
  #
@@ -22,11 +23,14 @@
22
23
  module NewsCrawler
23
24
  # Contains various methods for processing URLs
24
25
  module URLHelper
25
- # produce true if 2 urls belong to same domain
26
+ # produce true if 2 urls belong to the same domain, or if either url starts with '/'
26
27
  # @param [ String ] url1 Url 1
27
28
  # @param [ String ] url2 Url 2
28
29
  # @return [ Boolean ] true if both urls belong to the same domain, or if either url starts with '/'
29
30
  def same_domain?(url1, url2)
31
+ if (url1[0] == '/') || (url2[0] == '/')
32
+ return true
33
+ end
30
34
  p1 = get_url_path(url1)
31
35
  p2 = get_url_path(url2)
32
36
  d1 = p1[:domain].split('.').reverse
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: news_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3.pre.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hà Quang Dương
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-22 00:00:00.000000000 Z
11
+ date: 2013-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mongo
@@ -191,9 +191,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
191
191
  version: 2.0.0
192
192
  required_rubygems_version: !ruby/object:Gem::Requirement
193
193
  requirements:
194
- - - '>='
194
+ - - '>'
195
195
  - !ruby/object:Gem::Version
196
- version: '0'
196
+ version: 1.3.1
197
197
  requirements: []
198
198
  rubyforge_project:
199
199
  rubygems_version: 2.0.3