rssdump 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rssdump.rb +0 -1
- data/lib/rssdump/scraper.rb +7 -2
- data/lib/rssdump/version.rb +1 -1
- metadata +2 -3
- data/lib/rssdump/dumper.rb +0 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b7d6af91750470ce86258f4894c30b203321a327
|
4
|
+
data.tar.gz: a4ee133486a968fef2529416135e3c41269bfe59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04fd37d7ac11f96b795e4ded5517e85cae6759cabdd99fe4e75156de740da3ce67b483f5a928914a159c23835ad711cbad3ceaad72cbe4221987bcff3b8edd06
|
7
|
+
data.tar.gz: 0301743aefbe3176381d1b2f79053d5f7fc3389eb74fa40a11384e10ca87c1e932fa0a3f9f03c63b38889aa71c480065c8339b960cc71ae0b9b922b081cd53dc
|
data/lib/rssdump.rb
CHANGED
data/lib/rssdump/scraper.rb
CHANGED
@@ -7,8 +7,13 @@ module Rssdump
|
|
7
7
|
include Logging
|
8
8
|
include Cleaning
|
9
9
|
|
10
|
-
def scrap
|
11
|
-
|
10
|
+
def scrap feed
|
11
|
+
scrap_from_body(open(feed).read, feed)
|
12
|
+
end
|
13
|
+
|
14
|
+
def scrap_from_body body, feed_url
|
15
|
+
@errors = []
|
16
|
+
rss = SimpleRSS.parse ensure_valid(body)
|
12
17
|
status = :ok
|
13
18
|
errors = []
|
14
19
|
items = rss.items.map do |item|
|
data/lib/rssdump/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rssdump
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Damien Cram
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-07-
|
11
|
+
date: 2016-07-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -117,7 +117,6 @@ extra_rdoc_files: []
|
|
117
117
|
files:
|
118
118
|
- lib/rssdump.rb
|
119
119
|
- lib/rssdump/cleaning.rb
|
120
|
-
- lib/rssdump/dumper.rb
|
121
120
|
- lib/rssdump/item.rb
|
122
121
|
- lib/rssdump/scraper.rb
|
123
122
|
- lib/rssdump/tasks.rb
|
data/lib/rssdump/dumper.rb
DELETED
@@ -1,69 +0,0 @@
|
|
1
|
-
require 'thread/pool'
|
2
|
-
|
3
|
-
module Rssdump
|
4
|
-
class Dumper
|
5
|
-
def initialize opt
|
6
|
-
@mongo_url = opt[:mongo_url] || raise("Missing opt :mongo_url")
|
7
|
-
@store = MongoStore.new @mongo_url
|
8
|
-
@logger = Logging.logger[self]
|
9
|
-
end
|
10
|
-
|
11
|
-
def dump_feed feed_name, feed_url
|
12
|
-
begin
|
13
|
-
scraper = Scraper.new
|
14
|
-
@logger.info "Scraping feed #{feed_name} - #{feed_url}"
|
15
|
-
items = scraper.scrap feed_url, feed_name
|
16
|
-
nb = 0
|
17
|
-
for item in items
|
18
|
-
inserted = @store.upsert item
|
19
|
-
nb+=1 if inserted
|
20
|
-
end
|
21
|
-
if nb > 0
|
22
|
-
@logger.info "#{nb} new items inserted for feed #{feed_name} - #{feed_url}"
|
23
|
-
else
|
24
|
-
@logger.debug "No new item inserted for feed #{feed_name} - #{feed_url}"
|
25
|
-
end
|
26
|
-
if(nb == items.count)
|
27
|
-
@logger.warn "All #{nb} items were new for feed #{feed_name}. Consider increasing the scraping frequency"
|
28
|
-
end
|
29
|
-
{status: 0, nb_inserted: nb}
|
30
|
-
rescue => e
|
31
|
-
@logger.error "An error occurred while scraping feed #{feed_name} - #{feed_url}"
|
32
|
-
@logger.error "#{e}\n#{e.backtrace.join("\n")}"
|
33
|
-
{status: 1, error: e}
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def dump file, opts = {threads: 1}
|
38
|
-
@logger.info "Starting a batch scrap from feed file #{file} [opts: #{opts}]"
|
39
|
-
pool = Thread.pool(opts[:threads])
|
40
|
-
mutex = Mutex.new
|
41
|
-
|
42
|
-
@start = Time.now
|
43
|
-
nb_inserted = 0
|
44
|
-
IO.readlines(file, :encoding => 'UTF-8').select do |l|
|
45
|
-
!l.strip.empty? && !l.strip.start_with?('#')
|
46
|
-
end.map do |l|
|
47
|
-
l.split("\t").map(&:strip)
|
48
|
-
end.each do |feed_name, feed_url|
|
49
|
-
pool.process do
|
50
|
-
res = dump_feed(feed_name, feed_url)
|
51
|
-
mutex.synchronize do
|
52
|
-
nb_inserted += res[:nb_inserted] if res[:status] == 0
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# wait until all tasks are finished
|
58
|
-
|
59
|
-
@logger.info "Waiting for all pools to process."
|
60
|
-
pool.shutdown
|
61
|
-
@logger.info "All pools processed !"
|
62
|
-
|
63
|
-
@logger.info "Finished dumping file #{file} in #{"%.3f" % (Time.now - @start)} seconds."
|
64
|
-
@logger.info "Num of new items inserted #{nb_inserted}."
|
65
|
-
@logger.info "Total number of items in db: #{@store.nb_items}."
|
66
|
-
@logger.info "Data size : #{ '%.2f' % @store.disk_usage_mb}Mb."
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|