rssdump 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4a064db5574bf273163d1194ea130f14f9124f48
4
- data.tar.gz: 3aec73daf1261d65a697fbfdf113d33c22d21a33
3
+ metadata.gz: b7d6af91750470ce86258f4894c30b203321a327
4
+ data.tar.gz: a4ee133486a968fef2529416135e3c41269bfe59
5
5
  SHA512:
6
- metadata.gz: 0b27cb123ebd759ff853f35d4ae763a056fc2696750284039c862bea6de4a5632871d822718c98dc666cedca93de069a18678881206dba410ec43e6cd5579bc2
7
- data.tar.gz: c54ab35de1b51eaad4bd3baf42507f83a52cd7ba68b7af77e9c5eea13f4766c706223afcbafcb392ae0a34f5aef063d2225ae9e41456ddb6de62736a029e4975
6
+ metadata.gz: 04fd37d7ac11f96b795e4ded5517e85cae6759cabdd99fe4e75156de740da3ce67b483f5a928914a159c23835ad711cbad3ceaad72cbe4221987bcff3b8edd06
7
+ data.tar.gz: 0301743aefbe3176381d1b2f79053d5f7fc3389eb74fa40a11384e10ca87c1e932fa0a3f9f03c63b38889aa71c480065c8339b960cc71ae0b9b922b081cd53dc
data/lib/rssdump.rb CHANGED
@@ -2,6 +2,5 @@ require "logging"
2
2
 
3
3
  require_relative "./rssdump/cleaning"
4
4
  require_relative "./rssdump/item"
5
- require_relative "./rssdump/dumper"
6
5
  require_relative "./rssdump/scraper"
7
6
  require_relative "./rssdump/version"
@@ -7,8 +7,13 @@ module Rssdump
7
7
  include Logging
8
8
  include Cleaning
9
9
 
10
- def scrap feed_url
11
- rss = SimpleRSS.parse ensure_valid(open(feed_url).read)
10
+ def scrap feed
11
+ scrap_from_body(open(feed).read, feed)
12
+ end
13
+
14
+ def scrap_from_body body, feed_url
15
+ @errors = []
16
+ rss = SimpleRSS.parse ensure_valid(body)
12
17
  status = :ok
13
18
  errors = []
14
19
  items = rss.items.map do |item|
@@ -1,3 +1,3 @@
1
1
  module Rssdump
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rssdump
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Cram
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-06 00:00:00.000000000 Z
11
+ date: 2016-07-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -117,7 +117,6 @@ extra_rdoc_files: []
117
117
  files:
118
118
  - lib/rssdump.rb
119
119
  - lib/rssdump/cleaning.rb
120
- - lib/rssdump/dumper.rb
121
120
  - lib/rssdump/item.rb
122
121
  - lib/rssdump/scraper.rb
123
122
  - lib/rssdump/tasks.rb
@@ -1,69 +0,0 @@
1
- require 'thread/pool'
2
-
3
- module Rssdump
4
- class Dumper
5
- def initialize opt
6
- @mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
7
- @store = MongoStore.new @mongo_url
8
- @logger = Logging.logger[self]
9
- end
10
-
11
- def dump_feed feed_name, feed_url
12
- begin
13
- scraper = Scraper.new
14
- @logger.info "Scraping feed #{feed_name} - #{feed_url}"
15
- items = scraper.scrap feed_url, feed_name
16
- nb = 0
17
- for item in items
18
- inserted = @store.upsert item
19
- nb+=1 if inserted
20
- end
21
- if nb > 0
22
- @logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
23
- else
24
- @logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
25
- end
26
- if(nb == items.count)
27
- @logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
28
- end
29
- {status: 0, nb_inserted: nb}
30
- rescue => e
31
- @logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
32
- @logger.error "#{e}\n#{e.backtrace.join("\n")}"
33
- {status: 1, error: e}
34
- end
35
- end
36
-
37
- def dump file, opts = {threads: 1}
38
- @logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
39
- pool = Thread.pool(opts[:threads])
40
- mutex = Mutex.new
41
-
42
- @start = Time.now
43
- nb_inserted = 0
44
- IO.readlines(file, :encoding => 'UTF-8').select do |l|
45
- !l.strip.empty? && !l.strip.start_with?('#')
46
- end.map do |l|
47
- l.split("\t").map(&:strip)
48
- end.each do |feed_name, feed_url|
49
- pool.process do
50
- res = dump_feed(feed_name, feed_url)
51
- mutex.synchronize do
52
- nb_inserted += res[:nb_inserted] if res[:status] == 0
53
- end
54
- end
55
- end
56
-
57
- # wait until all tasks are finished
58
-
59
- @logger.info "Waiting for all pools to process."
60
- pool.shutdown
61
- @logger.info "All pools processed !"
62
-
63
- @logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
64
- @logger.info "Num of new items inserted #{nb_inserted}."
65
- @logger.info "Total number of items in db: #{@store.nb_items}."
66
- @logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
67
- end
68
- end
69
- end