rssdump 0.1.0
- checksums.yaml +7 -0
- data/lib/rssdump/cleaning.rb +36 -0
- data/lib/rssdump/dumper.rb +69 -0
- data/lib/rssdump/item.rb +6 -0
- data/lib/rssdump/mongo_store.rb +48 -0
- data/lib/rssdump/scraper.rb +37 -0
- data/lib/rssdump/tasks.rb +37 -0
- data/lib/rssdump/version.rb +3 -0
- data/lib/rssdump.rb +8 -0
- metadata +165 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 95a5bf76f09fe0626531a62ffbabc791fd1ca4ac
+  data.tar.gz: 8ba6115c49373a2281dcb120fa29b334d28abbe9
+SHA512:
+  metadata.gz: 1ffe505a06cf39cc23cd5dc40818163f32f00e75c1dc93d822f3fd797b198c6c96f1b3e06f0e6fb0d4295035a3a13d826ecf42f0844bcc26ca229123d8e1293e
+  data.tar.gz: 103b7df56f448c4e15c50b923b601716c15d98c54cc755b004723c96d0b8c4a7b8b4ff4d05816f51ea9e7041c61070b628f4df2e95ac31cd4a559c297eee2866
data/lib/rssdump/cleaning.rb
ADDED
@@ -0,0 +1,36 @@
+require 'nokogiri'
+require 'time'
+
+module Rssdump
+  module Cleaning
+    URLS = /https?:\/\/[\S]+/
+
+    def clean_link url
+      (url || "").strip
+    end
+
+    def clean_html txt
+      if txt
+        ensure_valid txt
+        c = txt ? Nokogiri::HTML(CGI.unescapeHTML(txt)).text : ""
+        c = c.gsub(URLS, "").split("LIRE AUSSI")[0] || ""
+        c = c.gsub(/[\n\r\t ]+/, " ").strip
+        c
+      else
+        ""
+      end
+    end
+
+    def parse_pub_date str
+      Time.rfc2822 str
+    end
+
+    def ensure_valid txt
+      if !txt.valid_encoding?
+        txt.force_encoding("ISO-8859-1").encode("UTF-8")
+      else
+        txt
+      end
+    end
+  end
+end
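The Cleaning mixin above strips URLs and HTML markup, cuts everything after the French marker "LIRE AUSSI", collapses whitespace, and parses RFC 2822 publication dates. A minimal usage sketch follows (not part of the gem: the class name and sample strings are invented, `require 'rssdump'` is assumed to load the module, and 'cgi' is required explicitly because clean_html calls CGI.unescapeHTML):

  # Usage sketch only; CleaningDemo and the sample strings are made up for illustration.
  require 'cgi'      # clean_html relies on CGI.unescapeHTML
  require 'rssdump'  # assumed to load Rssdump::Cleaning

  class CleaningDemo
    include Rssdump::Cleaning
  end

  demo = CleaningDemo.new
  demo.clean_html("&lt;p&gt;Breaking story http://example.com/a LIRE AUSSI more&lt;/p&gt;")
  # => "Breaking story"  (HTML unescaped and stripped, URL removed, "LIRE AUSSI ..." cut off)
  demo.clean_link("  http://example.com/article \n")      # => "http://example.com/article"
  demo.parse_pub_date("Wed, 04 May 2016 10:00:00 +0000")  # => 2016-05-04 10:00:00 +0000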
data/lib/rssdump/dumper.rb
ADDED
@@ -0,0 +1,69 @@
+require 'thread/pool'
+
+module Rssdump
+  class Dumper
+    def initialize opt
+      @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
+      @store = MongoStore.new @mongo_url
+      @logger = Logging.logger[self]
+    end
+
+    def dump_feed feed_name, feed_url
+      begin
+        scraper = Scraper.new
+        @logger.info "Scraping feed #{feed_name} - #{feed_url}"
+        items = scraper.scrap feed_url, feed_name
+        nb = 0
+        for item in items
+          inserted = @store.upsert item
+          nb+=1 if inserted
+        end
+        if nb > 0
+          @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
+        else
+          @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
+        end
+        if(nb == items.count)
+          @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
+        end
+        {status: 0, nb_inserted: nb}
+      rescue => e
+        @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
+        @logger.error "#{e}\n#{e.backtrace.join("\n")}"
+        {status: 1, error: e}
+      end
+    end
+
+    def dump file, opts = {threads: 1}
+      @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
+      pool = Thread.pool(opts[:threads])
+      mutex = Mutex.new
+
+      @start = Time.now
+      nb_inserted = 0
+      IO.readlines(file, :encoding => 'UTF-8').select do |l|
+        !l.strip.empty? && !l.strip.start_with?('#')
+      end.map do |l|
+        l.split("\t").map(&:strip)
+      end.each do |feed_name, feed_url|
+        pool.process do
+          res = dump_feed(feed_name, feed_url)
+          mutex.synchronize do
+            nb_inserted += res[:nb_inserted] if res[:status] == 0
+          end
+        end
+      end
+
+      # wait until all tasks are finished
+
+      @logger.info "Waiting for all pools to process."
+      pool.shutdown
+      @logger.info "All pools processed !"
+
+      @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
+      @logger.info "Num of new items inserted #{nb_inserted}."
+      @logger.info "Total number of items in db: #{@store.nb_items}."
+      @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
+    end
+  end
+end
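Dumper#dump reads a plain-text feed list where each non-blank, non-comment line is a feed name and a feed URL separated by a tab, scrapes every feed on a thread pool, and logs summary statistics at the end. A hedged usage sketch (the file name, feed name, and URL below are placeholders, and a reachable MongoDB instance is assumed):

  # Sketch only; "feeds.tsv" and the example feed are invented for illustration.
  require 'rssdump'

  # One "name<TAB>url" pair per line; blank lines and '#' comments are skipped.
  File.write("feeds.tsv", "example_feed\thttp://example.com/rss.xml\n")

  dumper = Rssdump::Dumper.new(mongo_url: "mongodb://127.0.0.1:27017/rssdump")
  dumper.dump("feeds.tsv", threads: 2)   # scrapes all listed feeds with 2 worker threads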
data/lib/rssdump/item.rb
ADDED
data/lib/rssdump/mongo_store.rb
ADDED
@@ -0,0 +1,48 @@
+require 'mongo'
+
+module Rssdump
+  class MongoStore
+    DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
+    COLL_ITEMS = "items"
+
+    def initialize url
+      @url = url || DEFAULT_URL
+      client[COLL_ITEMS].indexes.create_one({ :link => 1 }, :unique => true)
+      client[COLL_ITEMS].indexes.create_one({ :pub_date => 1 }, :unique => false)
+      @logger = Logging.logger[self]
+    end
+
+    def upsert item
+      if client[COLL_ITEMS].find({link: item.link}).count == 0
+        @logger.debug "Inserting new item #{item.link} to store"
+        client[COLL_ITEMS].insert_one({
+          v: Rssdump::VERSION,
+          title: item.title,
+          link: item.link,
+          feed: item.feed,
+          feed_name: item.feed_name,
+          category: item.category,
+          description: item.description,
+          pub_date: item.pub_date,
+        })
+        true
+      else
+        false
+      end
+    end
+
+    def nb_items
+      client[COLL_ITEMS].count
+    end
+
+    def disk_usage_mb
+      client.database.command({dbStats: 1, scale: 1024**2}).first["dataSize"]
+    end
+
+    private
+
+    def client
+      @client ||= Mongo::Client.new(@url)
+    end
+  end
+end
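MongoStore keeps one document per item in the "items" collection, keyed by a unique index on link, so upsert returns true only the first time a given link is seen. A rough sketch (it assumes a MongoDB server at the default URL and that Rssdump::Item exposes the writers used by Scraper; both are inferred from the surrounding code, not documented here):

  # Sketch only; requires a running MongoDB and the (not shown) Rssdump::Item class.
  require 'rssdump'

  store = Rssdump::MongoStore.new(nil)          # nil falls back to DEFAULT_URL
  item  = Rssdump::Item.new
  item.link  = "http://example.com/article-1"   # link is the unique key
  item.title = "Example article"

  store.upsert(item)   # => true  (first insert)
  store.upsert(item)   # => false (link already stored)
  store.nb_items       # total documents in the "items" collection
  store.disk_usage_mb  # dataSize reported by dbStats, scaled to megabytes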
data/lib/rssdump/scraper.rb
ADDED
@@ -0,0 +1,37 @@
+# encoding: utf-8
+require 'simple-rss'
+require 'open-uri'
+
+module Rssdump
+  class Scraper
+    include Logging
+    include Cleaning
+
+    attr_reader :errors
+
+    def scrap feed, feed_name = "_"
+      @errors = []
+      rss = SimpleRSS.parse ensure_valid(open(feed).read)
+      rss.items.map do |item|
+        begin
+          ritem = Item.new
+          ritem.title = clean_html(item.title)
+          ritem.category = clean_html(item.category)
+          ritem.description = clean_html(item.description)
+          ritem.pub_date = item.pubDate || item.updated
+          ritem.link = clean_link(item.link)
+          ritem.feed = feed
+          ritem.feed_name = feed_name
+          ritem
+        rescue => e
+          logger.error "An error occurred during cleaning with item #{item.link}."
+          logger.error "#{e}\n#{e.backtrace.join("\n")}"
+          logger.warn "Ignoring item #{item.link}."
+          nil
+        end
+      end.select do |item|
+        !item.nil?
+      end
+    end
+  end
+end
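Scraper#scrap fetches a feed with open-uri, parses it with SimpleRSS, runs each entry through the Cleaning helpers, and drops entries that raise during cleaning (they are only logged). A short sketch of calling it directly (the feed URL and feed name are placeholders):

  # Sketch only; the feed URL below is illustrative.
  require 'rssdump'

  scraper = Rssdump::Scraper.new
  items = scraper.scrap("http://example.com/rss.xml", "example_feed")
  items.each do |item|
    puts "#{item.pub_date}  #{item.title}  (#{item.link})"
  end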
data/lib/rssdump/tasks.rb
ADDED
@@ -0,0 +1,37 @@
+require_relative "../rssdump"
+require 'awesome_print'
+
+def obj_to_hash obj
+  obj.instance_variables.each_with_object({}) do |var, hash|
+    hash[var.to_s.delete("@")] = obj.instance_variable_get(var)
+  end
+end
+
+namespace :collect do
+  desc "Retrieves a feed given a URL and prints the parsed items to the console"
+  task :feed do
+    url = ENV["URL"] || raise("Missing URL variable")
+    items = Rssdump::Scraper.new.scrap url
+    items.each do |item|
+      ap obj_to_hash(item)
+    end
+  end
+  desc "Retrieves a feed given a URL, parses it using SimpleRSS and prints items to the console"
+  task :feed_simplerss do
+    url = ENV["URL"] || raise("Missing URL variable")
+    rss = SimpleRSS.parse open(url)
+    rss.items.map do |item|
+      ap '-'*80
+      ap "title: [#{item.title ? item.title.encoding : ""}]#{item.title}"
+      ap "description: [#{item.description ? item.description.encoding : ""}]#{item.description}"
+      ap "pubDate: #{item.pubDate}"
+      ap "category: [#{item.category ? item.category.encoding : ""}]#{item.category}"
+      ap "link: [#{item.link ? item.link.encoding : ""}]#{item.link}"
+    end
+  end
+  desc "Retrieves a feed given a URL and prints raw body to the console"
+  task :feed_raw do
+    url = ENV["URL"] || raise("Missing URL variable")
+    ap open(url).read
+  end
+end
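tasks.rb defines rake tasks in the collect: namespace that read the feed URL from the URL environment variable. A sketch of wiring them into a project Rakefile and invoking them (assuming the gem is installed; the URLs are placeholders):

  # Rakefile (sketch); the gem ships lib/rssdump/tasks.rb, so a plain require loads it.
  require 'rssdump/tasks'

  # Then, from the shell:
  #   rake collect:feed URL=http://example.com/rss.xml             # cleaned, parsed items
  #   rake collect:feed_simplerss URL=http://example.com/rss.xml   # raw SimpleRSS fields
  #   rake collect:feed_raw URL=http://example.com/rss.xml         # raw feed body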
data/lib/rssdump.rb
ADDED
metadata
ADDED
@@ -0,0 +1,165 @@
+--- !ruby/object:Gem::Specification
+name: rssdump
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Damien Cram
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-05-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: logging
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: simple-rss
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: thread
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: mongo
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '2.1'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '2.1'
+- !ruby/object:Gem::Dependency
+  name: awesome_print
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
+  Rssdump is based on simple-rss.
+email:
+- damien.cram@laposte.net
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/rssdump.rb
+- lib/rssdump/cleaning.rb
+- lib/rssdump/dumper.rb
+- lib/rssdump/item.rb
+- lib/rssdump/mongo_store.rb
+- lib/rssdump/scraper.rb
+- lib/rssdump/tasks.rb
+- lib/rssdump/version.rb
+homepage: http://github.com/pompadour/rssdump.git
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.1
+signing_key:
+specification_version: 4
+summary: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
+  Rssdump is based on simple-rss.
+test_files: []
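The metadata above declares runtime dependencies on nokogiri, logging, simple-rss, thread, mongo (>= 2.1) and awesome_print, which Bundler resolves automatically. A minimal Gemfile sketch (the source host is the standard RubyGems default, not stated in the metadata):

  # Gemfile (sketch); the version pin matches the release shown above.
  source 'https://rubygems.org'
  gem 'rssdump', '0.1.0'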