rssdump 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rssdump/item.rb +1 -1
- data/lib/rssdump/scraper.rb +13 -8
- data/lib/rssdump/version.rb +1 -1
- data/lib/rssdump.rb +0 -1
- metadata +4 -21
- data/lib/rssdump/mongo_store.rb +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a064db5574bf273163d1194ea130f14f9124f48
|
4
|
+
data.tar.gz: 3aec73daf1261d65a697fbfdf113d33c22d21a33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0b27cb123ebd759ff853f35d4ae763a056fc2696750284039c862bea6de4a5632871d822718c98dc666cedca93de069a18678881206dba410ec43e6cd5579bc2
|
7
|
+
data.tar.gz: c54ab35de1b51eaad4bd3baf42507f83a52cd7ba68b7af77e9c5eea13f4766c706223afcbafcb392ae0a34f5aef063d2225ae9e41456ddb6de62736a029e4975
|
data/lib/rssdump/item.rb
CHANGED
data/lib/rssdump/scraper.rb
CHANGED
@@ -7,12 +7,11 @@ module Rssdump
|
|
7
7
|
include Logging
|
8
8
|
include Cleaning
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
rss.items.map do |item|
|
10
|
+
def scrap feed_url
|
11
|
+
rss = SimpleRSS.parse ensure_valid(open(feed_url).read)
|
12
|
+
status = :ok
|
13
|
+
errors = []
|
14
|
+
items = rss.items.map do |item|
|
16
15
|
begin
|
17
16
|
ritem = Item.new
|
18
17
|
ritem.title = clean_html(item.title)
|
@@ -20,10 +19,11 @@ module Rssdump
|
|
20
19
|
ritem.description = clean_html(item.description)
|
21
20
|
ritem.pub_date = item.pubDate || item.updated
|
22
21
|
ritem.link = clean_link(item.link)
|
23
|
-
ritem.feed =
|
24
|
-
ritem.feed_name = feed_name
|
22
|
+
ritem.feed = feed_url
|
25
23
|
ritem
|
26
24
|
rescue => e
|
25
|
+
status = :ko
|
26
|
+
errors << e
|
27
27
|
logger.error "An error occurred during cleaning with item #{item.link}."
|
28
28
|
logger.error "#{e}\n#{e.backtrace.join("\n")}"
|
29
29
|
logger.warn "Ignoring item #{item.link}."
|
@@ -32,6 +32,11 @@ module Rssdump
|
|
32
32
|
end.select do |item|
|
33
33
|
!item.nil?
|
34
34
|
end
|
35
|
+
{
|
36
|
+
status: status,
|
37
|
+
errors: errors,
|
38
|
+
items: items
|
39
|
+
}
|
35
40
|
end
|
36
41
|
end
|
37
42
|
end
|
data/lib/rssdump/version.rb
CHANGED
data/lib/rssdump.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rssdump
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Damien Cram
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: mongo
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '2.1'
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '2.1'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: awesome_print
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,8 +108,7 @@ dependencies:
|
|
122
108
|
- - ">="
|
123
109
|
- !ruby/object:Gem::Version
|
124
110
|
version: '0'
|
125
|
-
description: Retrieves all items from an RSS feed and
|
126
|
-
Rssdump is based on simple-rss.
|
111
|
+
description: Retrieves all items from an RSS feed and clean them.
|
127
112
|
email:
|
128
113
|
- damien.cram@laposte.net
|
129
114
|
executables: []
|
@@ -134,7 +119,6 @@ files:
|
|
134
119
|
- lib/rssdump/cleaning.rb
|
135
120
|
- lib/rssdump/dumper.rb
|
136
121
|
- lib/rssdump/item.rb
|
137
|
-
- lib/rssdump/mongo_store.rb
|
138
122
|
- lib/rssdump/scraper.rb
|
139
123
|
- lib/rssdump/tasks.rb
|
140
124
|
- lib/rssdump/version.rb
|
@@ -160,6 +144,5 @@ rubyforge_project:
|
|
160
144
|
rubygems_version: 2.5.1
|
161
145
|
signing_key:
|
162
146
|
specification_version: 4
|
163
|
-
summary: Retrieves all items from an RSS feed and
|
164
|
-
Rssdump is based on simple-rss.
|
147
|
+
summary: Retrieves all items from an RSS feed and clean them.
|
165
148
|
test_files: []
|
data/lib/rssdump/mongo_store.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
require 'mongo'
|
2
|
-
|
3
|
-
module Rssdump
|
4
|
-
class MongoStore
|
5
|
-
DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
|
6
|
-
COLL_ITEMS = "items"
|
7
|
-
|
8
|
-
def initialize url
|
9
|
-
@url = url || DEFAULT_URL
|
10
|
-
client[COLL_ITEMS].indexes.create_one({ :link => 1 }, :unique => true)
|
11
|
-
client[COLL_ITEMS].indexes.create_one({ :pub_date => 1 }, :unique => false)
|
12
|
-
@logger = Logging.logger[self]
|
13
|
-
end
|
14
|
-
|
15
|
-
def upsert item
|
16
|
-
if client[COLL_ITEMS].find({link: item.link}).count == 0
|
17
|
-
@logger.debug "Inserting new item #{item.link} to store"
|
18
|
-
client[COLL_ITEMS].insert_one({
|
19
|
-
v: Rssdump::VERSION,
|
20
|
-
title: item.title,
|
21
|
-
link: item.link,
|
22
|
-
feed: item.feed,
|
23
|
-
feed_name: item.feed_name,
|
24
|
-
category: item.category,
|
25
|
-
description: item.description,
|
26
|
-
pub_date: item.pub_date,
|
27
|
-
})
|
28
|
-
true
|
29
|
-
else
|
30
|
-
false
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def nb_items
|
35
|
-
client[COLL_ITEMS].count
|
36
|
-
end
|
37
|
-
|
38
|
-
def disk_usage_mb
|
39
|
-
client.database.command({dbStats: 1, scale: 1024**2}).first["dataSize"]
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def client
|
45
|
-
@client ||= Mongo::Client.new(@url)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|