news_crawler 0.0.2 → 0.0.3.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
|
4
|
+
data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
|
7
|
+
data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
|
@@ -22,6 +22,7 @@
|
|
22
22
|
|
23
23
|
require 'celluloid'
|
24
24
|
require 'nokogiri'
|
25
|
+
require 'uri'
|
25
26
|
|
26
27
|
require 'news_crawler/storage/raw_data'
|
27
28
|
require 'news_crawler/url_helper'
|
@@ -60,13 +61,14 @@ module NewsCrawler
|
|
60
61
|
inner_url = html_doc.xpath('//a').collect { | a_el |
|
61
62
|
temp_url = (a_el.attribute 'href').to_s
|
62
63
|
if (!temp_url.nil?) && (temp_url[0] == '/')
|
63
|
-
temp_url = url
|
64
|
+
temp_url = URI.join(url, temp_url).to_s
|
64
65
|
end
|
65
66
|
temp_url
|
66
67
|
}
|
67
68
|
|
68
69
|
inner_url.delete_if { | url |
|
69
|
-
(url.nil?) || (url.size == 0) || (url == '#')
|
70
|
+
(url.nil?) || (url.size == 0) || (url == '#') ||
|
71
|
+
(url == 'javascript:;')
|
70
72
|
}
|
71
73
|
|
72
74
|
# select url from same domain
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#--
|
2
3
|
# NewsCrawler - a website crawler
|
3
4
|
#
|
@@ -22,11 +23,14 @@
|
|
22
23
|
module NewsCrawler
|
23
24
|
# Contains various method for processing url
|
24
25
|
module URLHelper
|
25
|
-
# produce true if 2 urls belong to same domain
|
26
|
+
# produce true if 2 urls belong to same domain, or url is start with '/'
|
26
27
|
# @param [ String ] url1 Url 1
|
27
28
|
# @param [ String ] url2 Url 2
|
28
29
|
# @return [ Boolean ] true if both url belong to same domain
|
29
30
|
def same_domain?(url1, url2)
|
31
|
+
if (url1[0] == '/') || (url2[0] == '/')
|
32
|
+
return true
|
33
|
+
end
|
30
34
|
p1 = get_url_path(url1)
|
31
35
|
p2 = get_url_path(url2)
|
32
36
|
d1 = p1[:domain].split('.').reverse
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|
@@ -191,9 +191,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
191
191
|
version: 2.0.0
|
192
192
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
193
|
requirements:
|
194
|
-
- - '
|
194
|
+
- - '>'
|
195
195
|
- !ruby/object:Gem::Version
|
196
|
-
version:
|
196
|
+
version: 1.3.1
|
197
197
|
requirements: []
|
198
198
|
rubyforge_project:
|
199
199
|
rubygems_version: 2.0.3
|