rssdump 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/rssdump/cleaning.rb +36 -0
- data/lib/rssdump/dumper.rb +69 -0
- data/lib/rssdump/item.rb +6 -0
- data/lib/rssdump/mongo_store.rb +48 -0
- data/lib/rssdump/scraper.rb +37 -0
- data/lib/rssdump/tasks.rb +37 -0
- data/lib/rssdump/version.rb +3 -0
- data/lib/rssdump.rb +8 -0
- metadata +165 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 95a5bf76f09fe0626531a62ffbabc791fd1ca4ac
|
4
|
+
data.tar.gz: 8ba6115c49373a2281dcb120fa29b334d28abbe9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1ffe505a06cf39cc23cd5dc40818163f32f00e75c1dc93d822f3fd797b198c6c96f1b3e06f0e6fb0d4295035a3a13d826ecf42f0844bcc26ca229123d8e1293e
|
7
|
+
data.tar.gz: 103b7df56f448c4e15c50b923b601716c15d98c54cc755b004723c96d0b8c4a7b8b4ff4d05816f51ea9e7041c61070b628f4df2e95ac31cd4a559c297eee2866
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'time'
|
3
|
+
|
4
|
+
require 'cgi' # CGI.unescapeHTML is used by clean_html below

module Rssdump
  # Text-normalisation helpers for feed items: link trimming, HTML-to-text
  # conversion, publication-date parsing and encoding repair.
  module Cleaning
    # Matches an http(s) URL up to the next whitespace character.
    URLS = /https?:\/\/[\S]+/

    # Returns +url+ with leading/trailing whitespace removed.
    # A nil +url+ yields an empty string.
    def clean_link(url)
      (url || "").strip
    end

    # Reduces an HTML fragment to single-line plain text.
    #
    # The text is encoding-repaired, HTML entities are unescaped, markup is
    # dropped through Nokogiri, URLs are removed, everything from the first
    # "LIRE AUSSI" marker onward is cut, and whitespace runs are collapsed.
    #
    # @param txt [String, nil] raw HTML
    # @return [String] cleaned text, "" when +txt+ is nil
    def clean_html(txt)
      return "" unless txt

      # Use the value returned by ensure_valid: it is the re-encoded string.
      valid = ensure_valid(txt)
      text = Nokogiri::HTML(CGI.unescapeHTML(valid)).text
      # split returns [] for an empty string, hence the "" fallback.
      text = text.gsub(URLS, "").split("LIRE AUSSI")[0] || ""
      text.gsub(/[\n\r\t ]+/, " ").strip
    end

    # Parses an RFC 2822 date string into a Time.
    #
    # @raise [ArgumentError] when +str+ is not a valid RFC 2822 date
    def parse_pub_date(str)
      Time.rfc2822(str)
    end

    # Returns +txt+ unchanged when its bytes are valid for its encoding.
    # Otherwise the bytes are reinterpreted as ISO-8859-1 and transcoded to
    # UTF-8. The argument itself is never modified (a copy is re-tagged), so
    # frozen strings are accepted.
    def ensure_valid(txt)
      return txt if txt.valid_encoding?

      txt.dup.force_encoding("ISO-8859-1").encode("UTF-8")
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'thread/pool'
|
2
|
+
|
3
|
+
module Rssdump
  # Scrapes RSS feeds and stores their items through a MongoStore.
  class Dumper
    # @param opt [Hash] must contain :mongo_url
    # @raise [RuntimeError] when :mongo_url is missing
    def initialize(opt)
      @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
      @store = MongoStore.new(@mongo_url)
      @logger = Logging.logger[self]
    end

    # Scrapes one feed and upserts each of its items.
    #
    # Errors are logged and reported in the result instead of being raised.
    #
    # @return [Hash] {status: 0, nb_inserted: Integer} on success,
    #   {status: 1, error: Exception} on failure
    def dump_feed(feed_name, feed_url)
      scraper = Scraper.new
      @logger.info "Scraping feed #{feed_name} - #{feed_url}"
      items = scraper.scrap(feed_url, feed_name)
      # upsert is truthy only for items that were actually inserted.
      nb = items.count { |item| @store.upsert(item) }
      if nb > 0
        @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
        # Only meaningful when something was inserted: an empty feed must not
        # trigger the "all items were new" warning.
        if nb == items.count
          @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
        end
      else
        @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
      end
      {status: 0, nb_inserted: nb}
    rescue => e
      @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
      @logger.error "#{e}\n#{e.backtrace.join("\n")}"
      {status: 1, error: e}
    end

    # Dumps every feed listed in +file+ using a thread pool.
    #
    # Each non-blank line not starting with '#' is split on tabs into a feed
    # name and a feed URL.
    #
    # @param file [String] path of the feed list
    # @param opts [Hash] :threads is the pool size (1 when absent)
    def dump(file, opts = {threads: 1})
      @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
      # Fall back to one thread when the caller's hash has no :threads key.
      pool = Thread.pool(opts[:threads] || 1)
      mutex = Mutex.new

      @start = Time.now
      nb_inserted = 0
      feed_entries(file).each do |feed_name, feed_url|
        pool.process do
          res = dump_feed(feed_name, feed_url)
          # The counter is shared between pool tasks.
          mutex.synchronize do
            nb_inserted += res[:nb_inserted] if res[:status] == 0
          end
        end
      end

      # wait until all tasks are finished
      @logger.info "Waiting for all pools to process."
      pool.shutdown
      @logger.info "All pools processed !"

      @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
      @logger.info "Num of new items inserted #{nb_inserted}."
      @logger.info "Total number of items in db: #{@store.nb_items}."
      @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
    end

    private

    # Reads the feed list and returns one array of stripped tab-separated
    # fields per usable line. File.readlines is used so that a path is always
    # treated as a file name.
    def feed_entries(file)
      usable = File.readlines(file, :encoding => 'UTF-8').reject do |line|
        stripped = line.strip
        stripped.empty? || stripped.start_with?('#')
      end
      usable.map { |line| line.split("\t").map(&:strip) }
    end
  end
end
|
data/lib/rssdump/item.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'mongo'
|
2
|
+
|
3
|
+
module Rssdump
  # Stores feed items in the "items" collection of a MongoDB database.
  class MongoStore
    DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
    COLL_ITEMS = "items"

    # Remembers the connection URL (DEFAULT_URL when +url+ is nil) and creates
    # a unique index on :link and a non-unique index on :pub_date.
    def initialize(url)
      @url = url || DEFAULT_URL
      [[{ :link => 1 }, true], [{ :pub_date => 1 }, false]].each do |keys, unique|
        client[COLL_ITEMS].indexes.create_one(keys, :unique => unique)
      end
      @logger = Logging.logger[self]
    end

    # Inserts +item+ unless a document with the same link is already stored.
    #
    # NOTE(review): the lookup and the insert are two separate calls, so this
    # is presumably not safe against concurrent inserts of the same link -
    # confirm how callers share the store.
    #
    # @return [Boolean] true when a document was inserted, false otherwise
    def upsert(item)
      known = client[COLL_ITEMS].find({link: item.link}).count != 0
      return false if known

      @logger.debug "Inserting new item #{item.link} to store"
      client[COLL_ITEMS].insert_one(document_for(item))
      true
    end

    # Number of documents in the items collection.
    def nb_items
      client[COLL_ITEMS].count
    end

    # "dataSize" reported by the dbStats command, requested with a scale of
    # 1024**2.
    def disk_usage_mb
      stats = client.database.command({dbStats: 1, scale: 1024**2}).first
      stats["dataSize"]
    end

    private

    # Lazily created, memoized Mongo client for the configured URL.
    def client
      @client ||= Mongo::Client.new(@url)
    end

    # Document stored for +item+, tagged with the gem version.
    def document_for(item)
      {
        v: Rssdump::VERSION,
        title: item.title,
        link: item.link,
        feed: item.feed,
        feed_name: item.feed_name,
        category: item.category,
        description: item.description,
        pub_date: item.pub_date,
      }
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'simple-rss'
|
3
|
+
require 'open-uri'
|
4
|
+
|
5
|
+
module Rssdump
  # Downloads an RSS feed, parses it with SimpleRSS and converts each entry
  # into a cleaned Item.
  class Scraper
    include Logging
    include Cleaning

    # Exceptions rescued while converting entries during the last #scrap call.
    attr_reader :errors

    # Fetches and parses +feed+.
    #
    # Entries whose conversion raises are logged, recorded in #errors and
    # left out of the result.
    #
    # @param feed [String] feed URL (http, https or ftp) or local file path
    # @param feed_name [String] label copied onto every item
    # @return [Array<Item>] the successfully converted items
    def scrap(feed, feed_name = "_")
      @errors = []
      rss = SimpleRSS.parse ensure_valid(read_feed(feed))
      rss.items.map { |entry| build_item(entry, feed, feed_name) }.compact
    end

    private

    # Returns the raw body behind +feed+.
    #
    # Remote feeds are read through open-uri's URI#open; anything else is read
    # as a local file. Kernel#open is avoided on purpose: it stopped handling
    # URLs in Ruby 3.0 and treats a leading "|" as a command to execute.
    def read_feed(feed)
      if feed.to_s =~ %r{\A(?:https?|ftp)://}i
        URI.parse(feed).open(&:read)
      else
        File.read(feed)
      end
    end

    # Converts one parsed entry into an Item; returns nil when cleaning fails.
    def build_item(entry, feed, feed_name)
      ritem = Item.new
      ritem.title = clean_html(entry.title)
      ritem.category = clean_html(entry.category)
      ritem.description = clean_html(entry.description)
      ritem.pub_date = entry.pubDate || entry.updated
      ritem.link = clean_link(entry.link)
      ritem.feed = feed
      ritem.feed_name = feed_name
      ritem
    rescue => e
      @errors << e
      logger.error "An error occurred during cleaning with item #{entry.link}."
      logger.error "#{e}\n#{e.backtrace.join("\n")}"
      logger.warn "Ignoring item #{entry.link}."
      nil
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require_relative "../rssdump"
|
2
|
+
require 'awesome_print'
|
3
|
+
|
4
|
+
# Returns a Hash of +obj+'s instance variables, keyed by variable name with
# every "@" character removed.
def obj_to_hash(obj)
  pairs = obj.instance_variables.map do |ivar|
    [ivar.to_s.delete("@"), obj.instance_variable_get(ivar)]
  end
  Hash[pairs]
end
|
9
|
+
|
10
|
+
# Rake tasks for inspecting a single feed. Every task reads its target from
# the URL environment variable and raises when it is missing.
namespace :collect do
  # Returns the raw body behind +source+: http/https/ftp URLs are read through
  # open-uri's URI#open, anything else as a local file. Kernel#open is avoided
  # because it stopped handling URLs in Ruby 3.0 and treats a leading "|" as a
  # command to execute.
  fetch_body = lambda do |source|
    if source =~ %r{\A(?:https?|ftp)://}i
      URI.parse(source).open(&:read)
    else
      File.read(source)
    end
  end

  desc "Retrieves a feed given a URL and prints the parsed items to the console"
  task :feed do
    url = ENV["URL"] || raise("Missing URL variable")
    items = Rssdump::Scraper.new.scrap url
    items.each do |item|
      ap obj_to_hash(item)
    end
  end

  desc "Retrieves a feed given a URL, parses it using SimpleRSS and prints items to the console"
  task :feed_simplerss do
    url = ENV["URL"] || raise("Missing URL variable")
    rss = SimpleRSS.parse fetch_body.call(url)
    # Printed for inspection only; each field is prefixed with its encoding.
    rss.items.each do |item|
      ap '-'*80
      ap "title: [#{item.title ? item.title.encoding : ""}]#{item.title}"
      ap "description: [#{item.description ? item.description.encoding : ""}]#{item.description}"
      ap "pubDate: #{item.pubDate}"
      ap "category: [#{item.category ? item.category.encoding : ""}]#{item.category}"
      ap "link: [#{item.link ? item.link.encoding : ""}]#{item.link}"
    end
  end

  desc "Retrieves a feed given a URL and prints raw body to the console"
  task :feed_raw do
    url = ENV["URL"] || raise("Missing URL variable")
    ap fetch_body.call(url)
  end
end
|
data/lib/rssdump.rb
ADDED
metadata
ADDED
@@ -0,0 +1,165 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rssdump
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Damien Cram
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-05-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: logging
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: simple-rss
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: thread
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: mongo
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '2.1'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '2.1'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: awesome_print
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: rake
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :development
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
125
|
+
description: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
|
126
|
+
Rssdump is based on simple-rss.
|
127
|
+
email:
|
128
|
+
- damien.cram@laposte.net
|
129
|
+
executables: []
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files: []
|
132
|
+
files:
|
133
|
+
- lib/rssdump.rb
|
134
|
+
- lib/rssdump/cleaning.rb
|
135
|
+
- lib/rssdump/dumper.rb
|
136
|
+
- lib/rssdump/item.rb
|
137
|
+
- lib/rssdump/mongo_store.rb
|
138
|
+
- lib/rssdump/scraper.rb
|
139
|
+
- lib/rssdump/tasks.rb
|
140
|
+
- lib/rssdump/version.rb
|
141
|
+
homepage: http://github.com/pompadour/rssdump.git
|
142
|
+
licenses: []
|
143
|
+
metadata: {}
|
144
|
+
post_install_message:
|
145
|
+
rdoc_options: []
|
146
|
+
require_paths:
|
147
|
+
- lib
|
148
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
153
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
154
|
+
requirements:
|
155
|
+
- - ">="
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0'
|
158
|
+
requirements: []
|
159
|
+
rubyforge_project:
|
160
|
+
rubygems_version: 2.5.1
|
161
|
+
signing_key:
|
162
|
+
specification_version: 4
|
163
|
+
summary: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
|
164
|
+
Rssdump is based on simple-rss.
|
165
|
+
test_files: []
|