simple-news-crawler 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sn_crawler.rb +8 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 24fbc21d296bb28b773bb2fb6d2f955a73b67c4c
|
4
|
+
data.tar.gz: 09ddb5446af9e6b2b3a2961f9c897b02f81f0c95
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 905f48fe28a797e4b375aa101ab42ff15c119c14d25356e5f4ebbe5b32d0b0fdf2b60659fa07f22e6ad6ac36442dbc2d9a3b80ed158a9dbd54cdf898df352e4f
|
7
|
+
data.tar.gz: 4dd94b7507d88c73fde7705f592623a65116f1ffd037d5774eea51b8d78beb8d80ba5358360b00ed25177ba338a3a262ad732435d0d04b713e9fd014097419b0
|
data/lib/sn_crawler.rb
CHANGED
@@ -74,7 +74,7 @@ class SNCrawler
|
|
74
74
|
page.links_with(:href => /\.(rss|xml)/).each do |link|
|
75
75
|
src = ""
|
76
76
|
if !link.href.include? "http"
|
77
|
-
src = URI.parse(source).host + link.href
|
77
|
+
src = URI.parse(@source).host + link.href
|
78
78
|
else
|
79
79
|
src = link.href
|
80
80
|
end
|
@@ -126,16 +126,19 @@ class SNCrawler
|
|
126
126
|
request = Curl.get(u.to_s)
|
127
127
|
begin
|
128
128
|
source = XML::Parser.string(request.body_str)
|
129
|
-
|
129
|
+
root_content = source.parse
|
130
130
|
## Find all channels
|
131
|
-
channels =
|
131
|
+
channels = root_content.root.find(channel_path)
|
132
132
|
## For each channel processing the data
|
133
133
|
channels.each do |c|
|
134
|
-
|
135
|
-
if
|
134
|
+
lang_field = c.find_first('language')
|
135
|
+
if lang_field.nil? then
|
136
136
|
lang = "en_US"
|
137
|
+
else
|
138
|
+
lang = lang_field.content.to_s
|
137
139
|
end
|
138
140
|
items = c.find(item_tag)
|
141
|
+
puts item_tag
|
139
142
|
items.each do |i|
|
140
143
|
title = i.find_first(title_tag).content
|
141
144
|
title = title.gsub("'","")
|