rssdump 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 95a5bf76f09fe0626531a62ffbabc791fd1ca4ac
4
+ data.tar.gz: 8ba6115c49373a2281dcb120fa29b334d28abbe9
5
+ SHA512:
6
+ metadata.gz: 1ffe505a06cf39cc23cd5dc40818163f32f00e75c1dc93d822f3fd797b198c6c96f1b3e06f0e6fb0d4295035a3a13d826ecf42f0844bcc26ca229123d8e1293e
7
+ data.tar.gz: 103b7df56f448c4e15c50b923b601716c15d98c54cc755b004723c96d0b8c4a7b8b4ff4d05816f51ea9e7041c61070b628f4df2e95ac31cd4a559c297eee2866
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'time'
3
+
4
require 'cgi' # CGI.unescapeHTML is used below; do not rely on another gem loading it

module Rssdump
  # Text-normalisation helpers for feed fields. Meant to be mixed into a
  # class (the methods are plain instance methods and keep no state).
  module Cleaning
    # An http/https URL, running up to the next whitespace character.
    URLS = /https?:\/\/[\S]+/

    # Normalises a link value.
    #
    # @param url [String, nil] raw link
    # @return [String] the link without surrounding whitespace; "" for nil
    def clean_link(url)
      (url || "").strip
    end

    # Turns an HTML fragment into a single line of plain text.
    #
    # Steps: repair the encoding, unescape HTML entities, let Nokogiri
    # extract the text, remove URLs, drop everything from the first
    # "LIRE AUSSI" marker onwards, then collapse whitespace runs.
    #
    # @param txt [String, nil] raw (possibly entity-escaped) HTML
    # @return [String] cleaned text; "" when +txt+ is nil
    def clean_html(txt)
      return "" unless txt

      # Work on the string *returned* by ensure_valid. The previous code
      # ignored the return value and only worked because ensure_valid
      # changed the encoding of +txt+ in place.
      valid = ensure_valid(txt)
      text = Nokogiri::HTML(CGI.unescapeHTML(valid)).text
      # split(...)[0] is nil when the text is empty or starts with the
      # marker, hence the fallback to "".
      text = text.gsub(URLS, "").split("LIRE AUSSI")[0] || ""
      text.gsub(/[\n\r\t ]+/, " ").strip
    end

    # Parses an RFC 2822 date string.
    #
    # @param str [String] e.g. "Wed, 04 May 2016 10:00:00 +0000"
    # @return [Time]
    # @raise [ArgumentError] when +str+ is not a valid RFC 2822 date
    def parse_pub_date(str)
      Time.rfc2822(str)
    end

    # Returns +txt+ unchanged when its bytes are valid for its declared
    # encoding; otherwise reinterprets the bytes as ISO-8859-1 and
    # transcodes them to UTF-8.
    #
    # The argument is never modified (a copy is relabelled), so frozen
    # strings are accepted and callers keep their original object intact.
    #
    # @param txt [String]
    # @return [String] +txt+ itself, or a new UTF-8 string
    def ensure_valid(txt)
      return txt if txt.valid_encoding?

      txt.dup.force_encoding("ISO-8859-1").encode("UTF-8")
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'thread/pool'
2
+
3
module Rssdump
  # Scrapes RSS feeds and writes their items to a MongoStore.
  class Dumper
    # @param opt [Hash] options; +:mongo_url+ is required
    # @raise [RuntimeError] when +:mongo_url+ is missing
    def initialize(opt)
      @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
      @store = MongoStore.new(@mongo_url)
      @logger = Logging.logger[self]
    end

    # Scrapes one feed and stores every item the store reports as new.
    # Never raises a StandardError: failures are logged and reported in
    # the returned hash.
    #
    # @param feed_name [String] label used in logs and passed to the scraper
    # @param feed_url [String] location of the feed
    # @return [Hash] +{status: 0, nb_inserted: Integer}+ on success,
    #   +{status: 1, error: StandardError}+ on failure
    def dump_feed(feed_name, feed_url)
      scraper = Scraper.new
      @logger.info "Scraping feed #{feed_name} - #{feed_url}"
      items = scraper.scrap(feed_url, feed_name)
      # upsert returns true only for items that were not stored yet.
      nb = items.count { |item| @store.upsert(item) }

      if nb > 0
        @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
      else
        @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
      end
      # Every item being new suggests older ones were missed between runs.
      # An empty feed (0 of 0) is not that situation, so it must not warn.
      if nb > 0 && nb == items.size
        @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
      end
      {status: 0, nb_inserted: nb}
    rescue => e
      @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
      @logger.error "#{e}\n#{e.backtrace.join("\n")}"
      {status: 1, error: e}
    end

    # Scrapes every feed listed in +file+ using a thread pool.
    #
    # File format: one feed per line as "name<TAB>url"; blank lines and
    # lines starting with '#' are ignored.
    #
    # @param file [String] path of the feed list (read as UTF-8)
    # @param opts [Hash] +:threads+ is the pool size; it falls back to 1
    #   when the key is absent, so callers may pass a partial hash
    def dump(file, opts = {threads: 1})
      @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"

      # Read the list before creating the pool so that an unreadable file
      # does not leave idle worker threads behind.
      lines = IO.readlines(file, :encoding => 'UTF-8')
      wanted = lines.reject do |l|
        stripped = l.strip
        stripped.empty? || stripped.start_with?('#')
      end
      feeds = wanted.map { |l| l.split("\t").map(&:strip) }

      pool = Thread.pool(opts[:threads] || 1)
      mutex = Mutex.new

      @start = Time.now
      nb_inserted = 0
      feeds.each do |feed_name, feed_url|
        pool.process do
          res = dump_feed(feed_name, feed_url)
          # The counter is shared by all workers.
          mutex.synchronize do
            nb_inserted += res[:nb_inserted] if res[:status] == 0
          end
        end
      end

      # wait until all tasks are finished
      @logger.info "Waiting for all pools to process."
      pool.shutdown
      @logger.info "All pools processed !"

      @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
      @logger.info "Num of new items inserted #{nb_inserted}."
      @logger.info "Total number of items in db: #{@store.nb_items}."
      @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
    end
  end
end
@@ -0,0 +1,6 @@
1
+
2
module Rssdump
  # One feed entry. A bare attribute holder: every field is readable and
  # writable, and is nil until assigned.
  class Item
    # Entry content.
    attr_accessor :title
    attr_accessor :description
    attr_accessor :category
    attr_accessor :link
    attr_accessor :pub_date

    # Where the entry came from: the feed location and its label.
    attr_accessor :feed
    attr_accessor :feed_name
  end
end
@@ -0,0 +1,48 @@
1
+ require 'mongo'
2
+
3
module Rssdump
  # Persists feed items in a MongoDB collection, keyed by their link.
  class MongoStore
    DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
    COLL_ITEMS = "items"

    # Connects (lazily, through #client) and makes sure the indexes exist:
    # a unique one on +link+ and a plain one on +pub_date+.
    #
    # @param url [String, nil] MongoDB connection string; DEFAULT_URL when nil
    def initialize(url)
      @url = url || DEFAULT_URL
      # Set up before any database call so the logger is always available.
      @logger = Logging.logger[self]
      client[COLL_ITEMS].indexes.create_one({ :link => 1 }, :unique => true)
      client[COLL_ITEMS].indexes.create_one({ :pub_date => 1 }, :unique => false)
    end

    # Stores +item+ unless a document with the same link already exists.
    #
    # Implemented as one upsert with $setOnInsert instead of a separate
    # find followed by an insert: with the two-step version, two threads
    # handling the same link could both see "not found" and the second
    # insert would fail on the unique index. An existing document is left
    # untouched, as before.
    #
    # @param item [#link, #title, #feed, #feed_name, #category, #description, #pub_date]
    # @return [Boolean] true when a new document was created, false otherwise
    def upsert(item)
      result = client[COLL_ITEMS].update_one(
        { link: item.link },
        {
          # +link+ is not repeated here: on insert the server copies the
          # equality fields of the filter into the new document.
          "$setOnInsert" => {
            v: Rssdump::VERSION,
            title: item.title,
            feed: item.feed,
            feed_name: item.feed_name,
            category: item.category,
            description: item.description,
            pub_date: item.pub_date,
          }
        },
        upsert: true
      )
      inserted = !result.upserted_id.nil?
      @logger.debug "Inserting new item #{item.link} to store" if inserted
      inserted
    rescue Mongo::Error::OperationFailure => e
      # NOTE(review): concurrent upserts on a unique index can still lose
      # the race server-side and report a duplicate key (E11000). That just
      # means another writer stored the link first. Anything else is
      # re-raised.
      raise unless e.message.include?("E11000")

      false
    end

    # @return [Integer] number of documents in the items collection
    def nb_items
      client[COLL_ITEMS].count
    end

    # @return [Numeric] the "dataSize" figure of dbStats, requested with a
    #   scale of 1024**2 (i.e. expressed in megabytes)
    def disk_usage_mb
      client.database.command({dbStats: 1, scale: 1024**2}).first["dataSize"]
    end

    private

    # Memoised driver client for @url.
    def client
      @client ||= Mongo::Client.new(@url)
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+ require 'simple-rss'
3
+ require 'open-uri'
4
+
5
module Rssdump
  # Downloads a feed, parses it with SimpleRSS and converts every entry
  # into a cleaned Rssdump::Item.
  class Scraper
    include Logging
    include Cleaning

    # Exceptions raised while converting entries during the last #scrap
    # call (one per skipped entry).
    attr_reader :errors

    # Fetches and parses +feed+.
    #
    # Entries that fail to convert are logged, recorded in #errors and
    # left out of the result. Download and parse failures are not rescued
    # here.
    #
    # @param feed [String] http/https/ftp URL, or a local file path
    # @param feed_name [String] label stored on every item
    # @return [Array<Rssdump::Item>]
    def scrap(feed, feed_name = "_")
      @errors = []
      rss = SimpleRSS.parse(ensure_valid(read_feed(feed)))
      rss.items.map { |entry| build_item(entry, feed, feed_name) }.compact
    end

    private

    # Converts one parsed entry; returns nil (after logging) on failure.
    def build_item(entry, feed, feed_name)
      ritem = Item.new
      ritem.title = clean_html(entry.title)
      ritem.category = clean_html(entry.category)
      ritem.description = clean_html(entry.description)
      ritem.pub_date = entry.pubDate || entry.updated
      ritem.link = clean_link(entry.link)
      ritem.feed = feed
      ritem.feed_name = feed_name
      ritem
    rescue => e
      @errors << e
      logger.error "An error occurred during cleaning with item #{entry.link}."
      logger.error "#{e}\n#{e.backtrace.join("\n")}"
      logger.warn "Ignoring item #{entry.link}."
      nil
    end

    # Returns the raw body of the feed.
    #
    # Remote locations go through open-uri's URI#open and anything else is
    # read as a file. The bare Kernel#open used previously would execute a
    # value starting with "|" as a shell command, and it only fetches URLs
    # on Rubies where open-uri still patches Kernel#open. Both paths close
    # their handle before returning.
    def read_feed(feed)
      if feed.to_s =~ /\A(?:https?|ftp):\/\//i
        URI.parse(feed).open(&:read)
      else
        File.read(feed)
      end
    end

    # Logger for this instance, obtained the same way as in Dumper and
    # MongoStore.
    # NOTE(review): defined explicitly because `include Logging` does not
    # appear to supply an instance-level #logger — confirm against the
    # logging gem.
    def logger
      @logger ||= Logging.logger[self]
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require_relative "../rssdump"
2
+ require 'awesome_print'
3
+
4
# Builds a Hash describing +obj+'s instance variables: keys are the
# variable names as Strings without the "@", values are the current values.
# Key order follows obj.instance_variables.
def obj_to_hash(obj)
  pairs = obj.instance_variables.map do |ivar|
    [ivar.to_s.delete("@"), obj.instance_variable_get(ivar)]
  end
  Hash[pairs]
end
9
+
10
# Debugging tasks that fetch the feed named by the URL environment variable
# and print it at different stages of processing.
#
# Downloads use open-uri's URI#open rather than the bare Kernel#open: the
# latter treats a value starting with "|" as a shell command and only
# fetches URLs on Rubies where open-uri still patches it. The block form
# also closes the handle once the body has been read.
namespace :collect do
  desc "Retrieves a feed given a URL and prints the parsed items to the console"
  task :feed do
    url = ENV["URL"] || raise("Missing URL variable")
    items = Rssdump::Scraper.new.scrap(url)
    items.each { |item| ap obj_to_hash(item) }
  end

  desc "Retrieves a feed given a URL, parses it using SimpleRSS and prints items to the console"
  task :feed_simplerss do
    url = ENV["URL"] || raise("Missing URL variable")
    rss = SimpleRSS.parse(URI.parse(url).open(&:read))
    # Each field is shown with its string encoding (blank when absent).
    rss.items.each do |item|
      ap '-'*80
      ap "title: [#{item.title ? item.title.encoding : ""}]#{item.title}"
      ap "description: [#{item.description ? item.description.encoding : ""}]#{item.description}"
      ap "pubDate: #{item.pubDate}"
      ap "category: [#{item.category ? item.category.encoding : ""}]#{item.category}"
      ap "link: [#{item.link ? item.link.encoding : ""}]#{item.link}"
    end
  end

  desc "Retrieves a feed given a URL and prints raw body to the console"
  task :feed_raw do
    url = ENV["URL"] || raise("Missing URL variable")
    ap URI.parse(url).open(&:read)
  end
end
@@ -0,0 +1,3 @@
1
module Rssdump
  # Version of the gem. Frozen so the shared constant cannot be modified
  # in place by code that receives it.
  VERSION = "0.1.0".freeze
end
data/lib/rssdump.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "logging"
2
+
3
+ require_relative "./rssdump/cleaning"
4
+ require_relative "./rssdump/mongo_store"
5
+ require_relative "./rssdump/item"
6
+ require_relative "./rssdump/dumper"
7
+ require_relative "./rssdump/scraper"
8
+ require_relative "./rssdump/version"
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rssdump
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Damien Cram
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-05-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: logging
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simple-rss
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: thread
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mongo
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '2.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '2.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: awesome_print
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
126
+ Rssdump is based on simple-rss.
127
+ email:
128
+ - damien.cram@laposte.net
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - lib/rssdump.rb
134
+ - lib/rssdump/cleaning.rb
135
+ - lib/rssdump/dumper.rb
136
+ - lib/rssdump/item.rb
137
+ - lib/rssdump/mongo_store.rb
138
+ - lib/rssdump/scraper.rb
139
+ - lib/rssdump/tasks.rb
140
+ - lib/rssdump/version.rb
141
+ homepage: http://github.com/pompadour/rssdump.git
142
+ licenses: []
143
+ metadata: {}
144
+ post_install_message:
145
+ rdoc_options: []
146
+ require_paths:
147
+ - lib
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - ">="
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ requirements: []
159
+ rubyforge_project:
160
+ rubygems_version: 2.5.1
161
+ signing_key:
162
+ specification_version: 4
163
+ summary: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
164
+ Rssdump is based on simple-rss.
165
+ test_files: []