rssdump 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rssdump/item.rb +1 -1
- data/lib/rssdump/scraper.rb +13 -8
- data/lib/rssdump/version.rb +1 -1
- data/lib/rssdump.rb +0 -1
- metadata +4 -21
- data/lib/rssdump/mongo_store.rb +0 -48
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4a064db5574bf273163d1194ea130f14f9124f48
|
4
|
+
data.tar.gz: 3aec73daf1261d65a697fbfdf113d33c22d21a33
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0b27cb123ebd759ff853f35d4ae763a056fc2696750284039c862bea6de4a5632871d822718c98dc666cedca93de069a18678881206dba410ec43e6cd5579bc2
|
7
|
+
data.tar.gz: c54ab35de1b51eaad4bd3baf42507f83a52cd7ba68b7af77e9c5eea13f4766c706223afcbafcb392ae0a34f5aef063d2225ae9e41456ddb6de62736a029e4975
|
data/lib/rssdump/item.rb
CHANGED
data/lib/rssdump/scraper.rb
CHANGED
@@ -7,12 +7,11 @@ module Rssdump
|
|
7
7
|
include Logging
|
8
8
|
include Cleaning
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
rss.items.map do |item|
|
10
|
+
def scrap feed_url
|
11
|
+
rss = SimpleRSS.parse ensure_valid(open(feed_url).read)
|
12
|
+
status = :ok
|
13
|
+
errors = []
|
14
|
+
items = rss.items.map do |item|
|
16
15
|
begin
|
17
16
|
ritem = Item.new
|
18
17
|
ritem.title = clean_html(item.title)
|
@@ -20,10 +19,11 @@ module Rssdump
|
|
20
19
|
ritem.description = clean_html(item.description)
|
21
20
|
ritem.pub_date = item.pubDate || item.updated
|
22
21
|
ritem.link = clean_link(item.link)
|
23
|
-
ritem.feed =
|
24
|
-
ritem.feed_name = feed_name
|
22
|
+
ritem.feed = feed_url
|
25
23
|
ritem
|
26
24
|
rescue => e
|
25
|
+
status = :ko
|
26
|
+
errors << e
|
27
27
|
logger.error "An error occurred during cleaning with item #{item.link}."
|
28
28
|
logger.error "#{e}\n#{e.backtrace.join("\n")}"
|
29
29
|
logger.warn "Ignoring item #{item.link}."
|
@@ -32,6 +32,11 @@ module Rssdump
|
|
32
32
|
end.select do |item|
|
33
33
|
!item.nil?
|
34
34
|
end
|
35
|
+
{
|
36
|
+
status: status,
|
37
|
+
errors: errors,
|
38
|
+
items: items
|
39
|
+
}
|
35
40
|
end
|
36
41
|
end
|
37
42
|
end
|
data/lib/rssdump/version.rb
CHANGED
data/lib/rssdump.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rssdump
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Damien Cram
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -66,20 +66,6 @@ dependencies:
|
|
66
66
|
- - ">="
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: mongo
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '2.1'
|
76
|
-
type: :runtime
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '2.1'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
70
|
name: awesome_print
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -122,8 +108,7 @@ dependencies:
|
|
122
108
|
- - ">="
|
123
109
|
- !ruby/object:Gem::Version
|
124
110
|
version: '0'
|
125
|
-
description: Retrieves all items from an RSS feed and
|
126
|
-
Rssdump is based on simple-rss.
|
111
|
+
description: Retrieves all items from an RSS feed and clean them.
|
127
112
|
email:
|
128
113
|
- damien.cram@laposte.net
|
129
114
|
executables: []
|
@@ -134,7 +119,6 @@ files:
|
|
134
119
|
- lib/rssdump/cleaning.rb
|
135
120
|
- lib/rssdump/dumper.rb
|
136
121
|
- lib/rssdump/item.rb
|
137
|
-
- lib/rssdump/mongo_store.rb
|
138
122
|
- lib/rssdump/scraper.rb
|
139
123
|
- lib/rssdump/tasks.rb
|
140
124
|
- lib/rssdump/version.rb
|
@@ -160,6 +144,5 @@ rubyforge_project:
|
|
160
144
|
rubygems_version: 2.5.1
|
161
145
|
signing_key:
|
162
146
|
specification_version: 4
|
163
|
-
summary: Retrieves all items from an RSS feed and
|
164
|
-
Rssdump is based on simple-rss.
|
147
|
+
summary: Retrieves all items from an RSS feed and clean them.
|
165
148
|
test_files: []
|
data/lib/rssdump/mongo_store.rb
DELETED
@@ -1,48 +0,0 @@
|
|
1
|
-
require 'mongo'
|
2
|
-
|
3
|
-
module Rssdump
|
4
|
-
class MongoStore
|
5
|
-
DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
|
6
|
-
COLL_ITEMS = "items"
|
7
|
-
|
8
|
-
def initialize url
|
9
|
-
@url = url || DEFAULT_URL
|
10
|
-
client[COLL_ITEMS].indexes.create_one({ :link => 1 }, :unique => true)
|
11
|
-
client[COLL_ITEMS].indexes.create_one({ :pub_date => 1 }, :unique => false)
|
12
|
-
@logger = Logging.logger[self]
|
13
|
-
end
|
14
|
-
|
15
|
-
def upsert item
|
16
|
-
if client[COLL_ITEMS].find({link: item.link}).count == 0
|
17
|
-
@logger.debug "Inserting new item #{item.link} to store"
|
18
|
-
client[COLL_ITEMS].insert_one({
|
19
|
-
v: Rssdump::VERSION,
|
20
|
-
title: item.title,
|
21
|
-
link: item.link,
|
22
|
-
feed: item.feed,
|
23
|
-
feed_name: item.feed_name,
|
24
|
-
category: item.category,
|
25
|
-
description: item.description,
|
26
|
-
pub_date: item.pub_date,
|
27
|
-
})
|
28
|
-
true
|
29
|
-
else
|
30
|
-
false
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
def nb_items
|
35
|
-
client[COLL_ITEMS].count
|
36
|
-
end
|
37
|
-
|
38
|
-
def disk_usage_mb
|
39
|
-
client.database.command({dbStats: 1, scale: 1024**2}).first["dataSize"]
|
40
|
-
end
|
41
|
-
|
42
|
-
private
|
43
|
-
|
44
|
-
def client
|
45
|
-
@client ||= Mongo::Client.new(@url)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|