news_crawler 0.0.2 → 0.0.3.pre.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e52eaf3135e05aba597c96a7e9cafa43dba73cc8
|
4
|
+
data.tar.gz: 7e50f113f2b5be1f4932eb7ba83975e216777c89
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ce7400c42047ad78954b0e4f45e1493fb7833926e69864a0814662e5ddac4f6423247083fe943fbab33085ad27c410dbb500326690c74290821bf3b24b2454a
|
7
|
+
data.tar.gz: 66f0fd7d05b3fae8e9c12684b16240e2e9ae0add263018ec49e82564d8b1489a9564a3e8899a1a975a145500c5eee9afe002a9644b368bc0cf2c5206328d71fd
|
@@ -22,6 +22,7 @@
|
|
22
22
|
|
23
23
|
require 'celluloid'
|
24
24
|
require 'nokogiri'
|
25
|
+
require 'uri'
|
25
26
|
|
26
27
|
require 'news_crawler/storage/raw_data'
|
27
28
|
require 'news_crawler/url_helper'
|
@@ -60,13 +61,14 @@ module NewsCrawler
|
|
60
61
|
inner_url = html_doc.xpath('//a').collect { | a_el |
|
61
62
|
temp_url = (a_el.attribute 'href').to_s
|
62
63
|
if (!temp_url.nil?) && (temp_url[0] == '/')
|
63
|
-
temp_url = url
|
64
|
+
temp_url = URI.join(url, temp_url).to_s
|
64
65
|
end
|
65
66
|
temp_url
|
66
67
|
}
|
67
68
|
|
68
69
|
inner_url.delete_if { | url |
|
69
|
-
(url.nil?) || (url.size == 0) || (url == '#')
|
70
|
+
(url.nil?) || (url.size == 0) || (url == '#') ||
|
71
|
+
(url == 'javascript:;')
|
70
72
|
}
|
71
73
|
|
72
74
|
# select url from same domain
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
1
2
|
#--
|
2
3
|
# NewsCrawler - a website crawler
|
3
4
|
#
|
@@ -22,11 +23,14 @@
|
|
22
23
|
module NewsCrawler
|
23
24
|
# Contains various methods for processing URLs
|
24
25
|
module URLHelper
|
25
|
-
# produce true if 2 urls belong to same domain
|
26
|
+
# produce true if 2 urls belong to the same domain, or if either url starts with '/'
|
26
27
|
# @param [ String ] url1 Url 1
|
27
28
|
# @param [ String ] url2 Url 2
|
28
29
|
# @return [ Boolean ] true if both urls belong to the same domain, or if either url starts with '/'
|
29
30
|
def same_domain?(url1, url2)
|
31
|
+
if (url1[0] == '/') || (url2[0] == '/')
|
32
|
+
return true
|
33
|
+
end
|
30
34
|
p1 = get_url_path(url1)
|
31
35
|
p2 = get_url_path(url2)
|
32
36
|
d1 = p1[:domain].split('.').reverse
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: news_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hà Quang Dương
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-07-
|
11
|
+
date: 2013-07-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mongo
|
@@ -191,9 +191,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
191
191
|
version: 2.0.0
|
192
192
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
193
|
requirements:
|
194
|
-
- - '
|
194
|
+
- - '>'
|
195
195
|
- !ruby/object:Gem::Version
|
196
|
-
version:
|
196
|
+
version: 1.3.1
|
197
197
|
requirements: []
|
198
198
|
rubyforge_project:
|
199
199
|
rubygems_version: 2.0.3
|