rssdump 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 95a5bf76f09fe0626531a62ffbabc791fd1ca4ac
4
+ data.tar.gz: 8ba6115c49373a2281dcb120fa29b334d28abbe9
5
+ SHA512:
6
+ metadata.gz: 1ffe505a06cf39cc23cd5dc40818163f32f00e75c1dc93d822f3fd797b198c6c96f1b3e06f0e6fb0d4295035a3a13d826ecf42f0844bcc26ca229123d8e1293e
7
+ data.tar.gz: 103b7df56f448c4e15c50b923b601716c15d98c54cc755b004723c96d0b8c4a7b8b4ff4d05816f51ea9e7041c61070b628f4df2e95ac31cd4a559c297eee2866
@@ -0,0 +1,36 @@
1
+ require 'nokogiri'
2
+ require 'time'
3
+
4
module Rssdump
  # Text-normalisation helpers meant to be mixed into classes that handle
  # raw feed data (titles, descriptions, links).
  module Cleaning
    # Matches http/https URLs: the scheme followed by a run of non-whitespace.
    URLS = /https?:\/\/[\S]+/

    # Trims surrounding whitespace from a link.
    #
    # @param url [String, nil]
    # @return [String] the stripped link, or "" when +url+ is nil
    def clean_link(url)
      (url || "").strip
    end

    # Converts an HTML fragment into a single line of plain text.
    #
    # Steps: repair the encoding, unescape HTML entities, extract the text
    # with Nokogiri, remove URLs, keep only what precedes the first
    # "LIRE AUSSI" marker, then collapse whitespace runs to one space.
    #
    # NOTE(review): relies on CGI being loaded by another file; this file
    # itself does not require 'cgi' -- confirm.
    #
    # @param txt [String, nil] raw (possibly entity-escaped) HTML
    # @return [String] cleaned text, or "" when +txt+ is nil
    def clean_html(txt)
      return "" unless txt

      # ensure_valid returns a new string, so its result must be used here;
      # calling it only for its side effect left the text un-transcoded.
      html = CGI.unescapeHTML(ensure_valid(txt))
      text = Nokogiri::HTML(html).text
      # split(...)[0] is nil when the text is empty or starts with the marker.
      text = text.gsub(URLS, "").split("LIRE AUSSI")[0] || ""
      text.gsub(/[\n\r\t ]+/, " ").strip
    end

    # Parses an RFC 2822 date string.
    #
    # @param str [String]
    # @return [Time]
    # @raise [ArgumentError] when +str+ is not a valid RFC 2822 date
    def parse_pub_date(str)
      Time.rfc2822 str
    end

    # Returns +txt+ unchanged when its bytes are valid for its encoding;
    # otherwise reinterprets the bytes as ISO-8859-1 and transcodes to UTF-8.
    #
    # Works on a copy so the caller's string is never mutated (and frozen
    # strings are accepted).
    #
    # @param txt [String]
    # @return [String]
    def ensure_valid(txt)
      return txt if txt.valid_encoding?

      txt.dup.force_encoding("ISO-8859-1").encode("UTF-8")
    end
  end
end
@@ -0,0 +1,69 @@
1
+ require 'thread/pool'
2
+
3
module Rssdump
  # Scrapes RSS feeds and stores their items through a MongoStore.
  class Dumper
    # @param opt [Hash] options; +:mongo_url+ is mandatory
    # @raise [RuntimeError] when +:mongo_url+ is missing
    def initialize(opt)
      @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
      @store = MongoStore.new @mongo_url
      @logger = Logging.logger[self]
    end

    # Scrapes one feed and upserts every item into the store.
    #
    # Any StandardError raised while scraping or storing is logged and
    # reported in the returned hash instead of being propagated.
    #
    # @param feed_name [String] label used in logs and passed to the scraper
    # @param feed_url [String] location of the feed
    # @return [Hash] +{status: 0, nb_inserted: Integer}+ on success,
    #   +{status: 1, error: Exception}+ on failure
    def dump_feed(feed_name, feed_url)
      scraper = Scraper.new
      @logger.info "Scraping feed #{feed_name} - #{feed_url}"
      items = scraper.scrap feed_url, feed_name
      # upsert returns a truthy value only for items that were actually inserted.
      nb = items.count { |item| @store.upsert(item) }
      if nb > 0
        @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
      else
        @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
      end
      # Guard on nb > 0: an empty feed must not be reported as "all items new".
      if nb > 0 && nb == items.count
        @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
      end
      { status: 0, nb_inserted: nb }
    rescue => e
      @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
      @logger.error "#{e}\n#{e.backtrace.join("\n")}"
      { status: 1, error: e }
    end

    # Dumps every feed listed in +file+ using a thread pool.
    #
    # The file holds one feed per line as "name<TAB>url"; blank lines and
    # lines starting with '#' are ignored.
    #
    # @param file [String] path of the feed list
    # @param opts [Hash] +:threads+ sets the pool size (1 when absent)
    def dump(file, opts = { threads: 1 })
      @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
      # Fall back to one worker when the caller's hash has no :threads key.
      pool = Thread.pool(opts[:threads] || 1)
      mutex = Mutex.new

      @start = Time.now
      nb_inserted = 0
      feed_entries(file).each do |feed_name, feed_url|
        pool.process do
          res = dump_feed(feed_name, feed_url)
          # nb_inserted is shared between worker threads.
          mutex.synchronize do
            nb_inserted += res[:nb_inserted] if res[:status] == 0
          end
        end
      end

      # wait until all tasks are finished
      @logger.info "Waiting for all pools to process."
      pool.shutdown
      @logger.info "All pools processed !"

      @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
      @logger.info "Num of new items inserted #{nb_inserted}."
      @logger.info "Total number of items in db: #{@store.nb_items}."
      @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
    end

    private

    # Reads the feed list and returns the stripped tab-separated fields of
    # every meaningful line.
    def feed_entries(file)
      File.readlines(file, encoding: 'UTF-8')
          .map(&:strip)
          .reject { |line| line.empty? || line.start_with?('#') }
          .map { |line| line.split("\t").map(&:strip) }
    end
  end
end
@@ -0,0 +1,6 @@
1
+
2
module Rssdump
  # Plain value holder for one feed entry. Every attribute is readable and
  # writable, and is nil until assigned.
  class Item
    # Fields describing the entry itself.
    attr_accessor :link, :title, :description, :category, :pub_date
    # Fields identifying the feed the entry came from.
    attr_accessor :feed, :feed_name
  end
end
@@ -0,0 +1,48 @@
1
+ require 'mongo'
2
+
3
module Rssdump
  # Persists feed items in a MongoDB collection, one document per link.
  class MongoStore
    DEFAULT_URL = "mongodb://127.0.0.1:27017/rssdump"
    COLL_ITEMS = "items"

    # Connects lazily to +url+ (DEFAULT_URL when nil) and makes sure the
    # indexes on +link+ (unique) and +pub_date+ exist.
    #
    # @param url [String, nil] MongoDB connection string
    def initialize(url)
      @url = url || DEFAULT_URL
      client[COLL_ITEMS].indexes.create_one({ :link => 1 }, :unique => true)
      client[COLL_ITEMS].indexes.create_one({ :pub_date => 1 }, :unique => false)
      @logger = Logging.logger[self]
    end

    # Inserts +item+ unless a document with the same link already exists.
    #
    # The lookup and the insert are two separate operations, so a concurrent
    # writer can insert the same link in between. The unique index on +link+
    # then rejects the insert; that rejection is reported as "not inserted"
    # rather than raised.
    #
    # @param item [#link, #title, #feed, #feed_name, #category, #description, #pub_date]
    # @return [Boolean] true when a new document was inserted, false otherwise
    # @raise [Mongo::Error::OperationFailure] for failures other than a duplicate key
    def upsert(item)
      # Fetch at most one document instead of counting all matches.
      return false if client[COLL_ITEMS].find({ link: item.link }).limit(1).first

      @logger.debug "Inserting new item #{item.link} to store"
      client[COLL_ITEMS].insert_one({
        v: Rssdump::VERSION,
        title: item.title,
        link: item.link,
        feed: item.feed,
        feed_name: item.feed_name,
        category: item.category,
        description: item.description,
        pub_date: item.pub_date,
      })
      true
    rescue Mongo::Error::OperationFailure => e
      raise unless duplicate_key?(e)

      false
    end

    # @return [Integer] number of documents in the items collection
    def nb_items
      client[COLL_ITEMS].count
    end

    # @return [Numeric] the +dataSize+ reported by +dbStats+, scaled by 1024**2
    def disk_usage_mb
      client.database.command({dbStats: 1, scale: 1024**2}).first["dataSize"]
    end

    private

    # Memoised driver client for @url.
    def client
      @client ||= Mongo::Client.new(@url)
    end

    # True when +error+ is MongoDB's duplicate-key failure (code 11000).
    # Falls back to the message text when the error exposes no #code.
    def duplicate_key?(error)
      (error.respond_to?(:code) && error.code == 11000) ||
        error.message.include?("E11000")
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+ require 'simple-rss'
3
+ require 'open-uri'
4
+
5
module Rssdump
  # Downloads a feed, parses it with SimpleRSS and turns every entry into a
  # cleaned Rssdump::Item.
  class Scraper
    include Logging
    include Cleaning

    # Exceptions raised while converting entries during the last #scrap call.
    attr_reader :errors

    # Fetches and parses +feed+.
    #
    # Entries whose conversion raises are logged, recorded in #errors and
    # left out of the result.
    #
    # @param feed [String] URL (or local path) of the feed
    # @param feed_name [String] label stored on every item
    # @return [Array<Item>] the successfully converted entries
    def scrap(feed, feed_name = "_")
      @errors = []
      rss = SimpleRSS.parse ensure_valid(fetch(feed))
      rss.items.map { |entry| build_item(entry, feed, feed_name) }.compact
    end

    private

    # Same logger lookup as the sibling classes use.
    # NOTE(review): `include Logging` is not known to define an instance
    # #logger -- confirm; this method makes the rescue path self-sufficient.
    def logger
      @logger ||= Logging.logger[self]
    end

    # Returns the raw body of +feed+.
    #
    # URIs that open-uri can read are fetched through URI#read; anything else
    # is read as a local file. Kernel#open is deliberately avoided: it runs a
    # command for strings starting with "|" and no longer opens URLs on
    # Ruby 3.
    def fetch(feed)
      uri = begin
        URI.parse(feed)
      rescue URI::InvalidURIError
        nil
      end
      uri.respond_to?(:read) ? uri.read : File.read(feed)
    end

    # Converts one parsed entry to an Item; returns nil (after logging and
    # recording the error) when cleaning fails.
    def build_item(entry, feed, feed_name)
      ritem = Item.new
      ritem.title = clean_html(entry.title)
      ritem.category = clean_html(entry.category)
      ritem.description = clean_html(entry.description)
      ritem.pub_date = entry.pubDate || entry.updated
      ritem.link = clean_link(entry.link)
      ritem.feed = feed
      ritem.feed_name = feed_name
      ritem
    rescue => e
      @errors << e
      logger.error "An error occurred during cleaning with item #{entry.link}."
      logger.error "#{e}\n#{e.backtrace.join("\n")}"
      logger.warn "Ignoring item #{entry.link}."
      nil
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require_relative "../rssdump"
2
+ require 'awesome_print'
3
+
4
# Builds a Hash from the instance variables currently set on +obj+.
#
# @param obj [Object]
# @return [Hash{String => Object}] variable name (without the leading "@")
#   mapped to its value; empty when no instance variable is set
def obj_to_hash(obj)
  obj.instance_variables.each_with_object({}) do |var, hash|
    # Drop only the leading sigil.
    hash[var.to_s.sub(/\A@/, "")] = obj.instance_variable_get(var)
  end
end
9
+
10
namespace :collect do
  # Reads the feed location from the URL environment variable.
  feed_url = -> { ENV["URL"] || raise("Missing URL variable") }

  # Returns the raw body found at +url+. URIs that open-uri can read are
  # fetched with URI#read, anything else is read as a local file.
  # Kernel#open is avoided because it executes a command for values starting
  # with "|" and no longer opens URLs on Ruby 3.
  fetch_body = lambda do |url|
    uri = begin
      URI.parse(url)
    rescue URI::InvalidURIError
      nil
    end
    uri.respond_to?(:read) ? uri.read : File.read(url)
  end

  desc "Retrieves a feed given a URL and prints the parsed items to the console"
  task :feed do
    url = feed_url.call
    items = Rssdump::Scraper.new.scrap url
    items.each do |item|
      ap obj_to_hash(item)
    end
  end

  desc "Retrieves a feed given a URL, parses it using SimpleRSS and prints items to the console"
  task :feed_simplerss do
    url = feed_url.call
    rss = SimpleRSS.parse fetch_body.call(url)
    # Printing only: iterate with each, no result array is needed.
    rss.items.each do |item|
      ap '-'*80
      ap "title: [#{item.title ? item.title.encoding : ""}]#{item.title}"
      ap "description: [#{item.description ? item.description.encoding : ""}]#{item.description}"
      ap "pubDate: #{item.pubDate}"
      ap "category: [#{item.category ? item.category.encoding : ""}]#{item.category}"
      ap "link: [#{item.link ? item.link.encoding : ""}]#{item.link}"
    end
  end

  desc "Retrieves a feed given a URL and prints raw body to the console"
  task :feed_raw do
    url = feed_url.call
    ap fetch_body.call(url)
  end
end
@@ -0,0 +1,3 @@
1
module Rssdump
  # Gem version; frozen so the shared constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
data/lib/rssdump.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "logging"
2
+
3
+ require_relative "./rssdump/cleaning"
4
+ require_relative "./rssdump/mongo_store"
5
+ require_relative "./rssdump/item"
6
+ require_relative "./rssdump/dumper"
7
+ require_relative "./rssdump/scraper"
8
+ require_relative "./rssdump/version"
metadata ADDED
@@ -0,0 +1,165 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rssdump
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Damien Cram
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-05-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: logging
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: simple-rss
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: thread
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: mongo
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '2.1'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '2.1'
83
+ - !ruby/object:Gem::Dependency
84
+ name: awesome_print
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
126
+ Rssdump is based on simple-rss.
127
+ email:
128
+ - damien.cram@laposte.net
129
+ executables: []
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - lib/rssdump.rb
134
+ - lib/rssdump/cleaning.rb
135
+ - lib/rssdump/dumper.rb
136
+ - lib/rssdump/item.rb
137
+ - lib/rssdump/mongo_store.rb
138
+ - lib/rssdump/scraper.rb
139
+ - lib/rssdump/tasks.rb
140
+ - lib/rssdump/version.rb
141
+ homepage: http://github.com/pompadour/rssdump.git
142
+ licenses: []
143
+ metadata: {}
144
+ post_install_message:
145
+ rdoc_options: []
146
+ require_paths:
147
+ - lib
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - ">="
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ requirements: []
159
+ rubyforge_project:
160
+ rubygems_version: 2.5.1
161
+ signing_key:
162
+ specification_version: 4
163
+ summary: Retrieves all items from an RSS feed and stores them to a MongoDB collection.
164
+ Rssdump is based on simple-rss.
165
+ test_files: []