rssdump 1.0.0 → 1.1.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a064db5574bf273163d1194ea130f14f9124f48
4
- data.tar.gz: 3aec73daf1261d65a697fbfdf113d33c22d21a33
3
+ metadata.gz: b7d6af91750470ce86258f4894c30b203321a327
4
+ data.tar.gz: a4ee133486a968fef2529416135e3c41269bfe59
5
5
  SHA512:
6
- metadata.gz: 0b27cb123ebd759ff853f35d4ae763a056fc2696750284039c862bea6de4a5632871d822718c98dc666cedca93de069a18678881206dba410ec43e6cd5579bc2
7
- data.tar.gz: c54ab35de1b51eaad4bd3baf42507f83a52cd7ba68b7af77e9c5eea13f4766c706223afcbafcb392ae0a34f5aef063d2225ae9e41456ddb6de62736a029e4975
6
+ metadata.gz: 04fd37d7ac11f96b795e4ded5517e85cae6759cabdd99fe4e75156de740da3ce67b483f5a928914a159c23835ad711cbad3ceaad72cbe4221987bcff3b8edd06
7
+ data.tar.gz: 0301743aefbe3176381d1b2f79053d5f7fc3389eb74fa40a11384e10ca87c1e932fa0a3f9f03c63b38889aa71c480065c8339b960cc71ae0b9b922b081cd53dc
data/lib/rssdump.rb CHANGED
@@ -2,6 +2,5 @@ require "logging"
2
2
 
3
3
  require_relative "./rssdump/cleaning"
4
4
  require_relative "./rssdump/item"
5
- require_relative "./rssdump/dumper"
6
5
  require_relative "./rssdump/scraper"
7
6
  require_relative "./rssdump/version"
@@ -7,8 +7,13 @@ module Rssdump
7
7
  include Logging
8
8
  include Cleaning
9
9
 
10
- def scrap feed_url
11
- rss = SimpleRSS.parse ensure_valid(open(feed_url).read)
10
+ def scrap feed
11
+ scrap_from_body(open(feed).read, feed)
12
+ end
13
+
14
+ def scrap_from_body body, feed_url
15
+ @errors = []
16
+ rss = SimpleRSS.parse ensure_valid(body)
12
17
  status = :ok
13
18
  errors = []
14
19
  items = rss.items.map do |item|
@@ -1,3 +1,3 @@
1
1
  module Rssdump
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rssdump
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Cram
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-06 00:00:00.000000000 Z
11
+ date: 2016-07-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -117,7 +117,6 @@ extra_rdoc_files: []
117
117
  files:
118
118
  - lib/rssdump.rb
119
119
  - lib/rssdump/cleaning.rb
120
- - lib/rssdump/dumper.rb
121
120
  - lib/rssdump/item.rb
122
121
  - lib/rssdump/scraper.rb
123
122
  - lib/rssdump/tasks.rb
@@ -1,69 +0,0 @@
1
- require 'thread/pool'
2
-
3
- module Rssdump
4
- class Dumper
5
- def initialize opt
6
- @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
7
- @store = MongoStore.new @mongo_url
8
- @logger = Logging.logger[self]
9
- end
10
-
11
- def dump_feed feed_name, feed_url
12
- begin
13
- scraper = Scraper.new
14
- @logger.info "Scraping feed #{feed_name} - #{feed_url}"
15
- items = scraper.scrap feed_url, feed_name
16
- nb = 0
17
- for item in items
18
- inserted = @store.upsert item
19
- nb+=1 if inserted
20
- end
21
- if nb > 0
22
- @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
23
- else
24
- @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
25
- end
26
- if(nb == items.count)
27
- @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
28
- end
29
- {status: 0, nb_inserted: nb}
30
- rescue => e
31
- @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
32
- @logger.error "#{e}\n#{e.backtrace.join("\n")}"
33
- {status: 1, error: e}
34
- end
35
- end
36
-
37
- def dump file, opts = {threads: 1}
38
- @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
39
- pool = Thread.pool(opts[:threads])
40
- mutex = Mutex.new
41
-
42
- @start = Time.now
43
- nb_inserted = 0
44
- IO.readlines(file, :encoding => 'UTF-8').select do |l|
45
- !l.strip.empty? && !l.strip.start_with?('#')
46
- end.map do |l|
47
- l.split("\t").map(&:strip)
48
- end.each do |feed_name, feed_url|
49
- pool.process do
50
- res = dump_feed(feed_name, feed_url)
51
- mutex.synchronize do
52
- nb_inserted += res[:nb_inserted] if res[:status] == 0
53
- end
54
- end
55
- end
56
-
57
- # wait until all tasks are finished
58
-
59
- @logger.info "Waiting for all pools to process."
60
- pool.shutdown
61
- @logger.info "All pools processed !"
62
-
63
- @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
64
- @logger.info "Num of new items inserted #{nb_inserted}."
65
- @logger.info "Total number of items in db: #{@store.nb_items}."
66
- @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
67
- end
68
- end
69
- end