news_crawler 0.0.2 → 0.0.3.pre.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 50e84e9674b22d98be7b72371513219da5a23d38
4
- data.tar.gz: bd6cd50fe658c960134fdfff53631e32e63e4b76
3
+ metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
4
+ data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
5
5
  SHA512:
6
- metadata.gz: 405419795794f78bf0608d2b66707d842c69c7c53d1b312a2569e9c82e482563ea28d34c5183accd3bf4b213fc6291305d8295f9310918854cd24e19c1cf6a83
7
- data.tar.gz: 53e089fb98b6a9e583c54119215ead3444fcc285d71f3bf5ca6f188b55db13a45fa6a87d414631a81dc2f092cedc726d77907d2d542671a721d92581a4876df8
6
+ metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
7
+ data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
@@ -22,6 +22,7 @@
22
22
 
23
23
  require 'celluloid'
24
24
  require 'nokogiri'
25
+ require 'uri'
25
26
 
26
27
  require 'news_crawler/storage/raw_data'
27
28
  require 'news_crawler/url_helper'
@@ -60,13 +61,14 @@ module NewsCrawler
60
61
  inner_url = html_doc.xpath('//a').collect { | a_el |
61
62
  temp_url = (a_el.attribute 'href').to_s
62
63
  if (!temp_url.nil?) && (temp_url[0] == '/')
63
- temp_url = url + temp_url
64
+ temp_url = URI.join(url, temp_url).to_s
64
65
  end
65
66
  temp_url
66
67
  }
67
68
 
68
69
  inner_url.delete_if { | url |
69
- (url.nil?) || (url.size == 0) || (url == '#')
70
+ (url.nil?) || (url.size == 0) || (url == '#') ||
71
+ (url == 'javascript:;')
70
72
  }
71
73
 
72
74
  # select url from same domain
@@ -1,3 +1,4 @@
1
+ # -*- coding: utf-8 -*-
1
2
  #--
2
3
  # NewsCrawler - a website crawler
3
4
  #
@@ -22,11 +23,14 @@
22
23
  module NewsCrawler
23
24
  # Contains various methods for processing URLs
24
25
  module URLHelper
25
- # produce true if 2 urls belong to same domain
26
+ # produce true if 2 urls belong to the same domain, or if either url starts with '/'
26
27
  # @param [ String ] url1 Url 1
27
28
  # @param [ String ] url2 Url 2
28
29
  # @return [ Boolean ] true if both urls belong to the same domain
29
30
  def same_domain?(url1, url2)
31
+ if (url1[0] == '/') || (url2[0] == '/')
32
+ return true
33
+ end
30
34
  p1 = get_url_path(url1)
31
35
  p2 = get_url_path(url2)
32
36
  d1 = p1[:domain].split('.').reverse
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: news_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3.pre.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hà Quang Dương
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-22 00:00:00.000000000 Z
11
+ date: 2013-07-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mongo
@@ -191,9 +191,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
191
191
  version: 2.0.0
192
192
  required_rubygems_version: !ruby/object:Gem::Requirement
193
193
  requirements:
194
- - - '>='
194
+ - - '>'
195
195
  - !ruby/object:Gem::Version
196
- version: '0'
196
+ version: 1.3.1
197
197
  requirements: []
198
198
  rubyforge_project:
199
199
  rubygems_version: 2.0.3