title_grabber 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a541f189b34294558e51d72b88dc447e55e7b1d2fb802463ccd9bf27c2b4e19
4
- data.tar.gz: a1ab49973e029e4bdcbcae37fefbafe6ae2e5042dc653aa3b3e7dbff42c2d804
3
+ metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
4
+ data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
5
5
  SHA512:
6
- metadata.gz: f6c4ecdb7a780af13b926146a76d337fd433def78e4326ee7e674fb8617a3902d5942ef61c3a14c3d7a3522920daf7a66c2fdc2e85a402c81f076a2589692eab
7
- data.tar.gz: 2d921af22310ede97c1035c858abb1c87da7125df09587124b1a6d36a1aab6fa0cc745d560a83cce74b9aca2a63ae96244c74a05429cd124ec6342fdb64598ac
6
+ metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
7
+ data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.0)
4
+ title_grabber (0.3.1)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -1,5 +1,3 @@
1
- require "logger"
2
-
3
1
  require "http"
4
2
 
5
3
  require_relative "text_helper"
@@ -37,7 +35,7 @@ module HTTPHelper
37
35
 
38
36
  if retries <= MAX_RETRIES
39
37
  rest_time = rand(REST_INTERVAL)
40
- logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
38
+ logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
41
39
  sleep(rest_time)
42
40
  retry
43
41
  else
data/lib/text_helper.rb CHANGED
@@ -2,11 +2,16 @@ module TextHelper
2
2
  SINGLE_SPACE = -" "
3
3
 
4
4
  def utf8_encode(text = nil)
5
+ txt = String(text)
6
+
5
7
  begin
6
- String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
8
+ txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
7
9
  replace: -"")
8
10
  rescue EncodingError
9
11
  -""
12
+ else
13
+ txt.delete!(-"\u0000") # get rid of nasty null bytes
14
+ txt
10
15
  end
11
16
  end
12
17
 
@@ -15,7 +20,6 @@ module TextHelper
15
20
  text.strip!
16
21
  text.gsub!("\n", SINGLE_SPACE)
17
22
  text.gsub(/\s{2,}/, SINGLE_SPACE)
18
- text.delete!(-"\u0000") # get rid of nasty null bytes
19
23
  text
20
24
  end
21
25
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require "csv"
2
2
  require "etc"
3
3
  require "fileutils"
4
+ require "logger"
5
+ require "pathname"
4
6
 
5
7
  require "bundler/setup"
6
8
  require "oga"
@@ -24,16 +26,27 @@ module TitleGrabber
24
26
  include HTTPHelper
25
27
  include TextHelper
26
28
 
27
- attr_reader :lines, :out_path
29
+ attr_reader :lines, :out_path, :tmp_path, :logger
28
30
 
29
31
  def initialize(lines, options)
30
32
  @lines = lines
31
33
  @out_path = options[:output]
34
+ @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
35
+
36
+ logging_target = if ENV["DEBUG"]
37
+ STDOUT
38
+ else
39
+ log_file = Pathname(__FILE__).sub_ext(".log").
40
+ basename.
41
+ open("w")
42
+ end
43
+ @logger = Logger.new(logging_target)
32
44
  end
33
45
 
34
46
  def call
47
+ install_at_exit_handler
48
+
35
49
  queue = Queue.new
36
- tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
37
50
  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
38
51
  csv << HEADERS
39
52
 
@@ -62,10 +75,10 @@ module TitleGrabber
62
75
  rescue ThreadError; end
63
76
 
64
77
  while url
65
- if html = open_w_timeout(url)
78
+ if (html = open_w_timeout(url)) && !html.empty?
66
79
  doc = begin
67
80
  Oga.parse_html(html)
68
- rescue LL::ParserError => err
81
+ rescue ArgumentError, LL::ParserError => err
69
82
  logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
70
83
  nil
71
84
  end
@@ -88,12 +101,20 @@ module TitleGrabber
88
101
  end
89
102
  }.each(&:join)
90
103
  end
91
-
92
- FileUtils.mv(tmp_path, out_path)
93
104
  end
94
105
 
95
106
  private
96
107
 
108
+ def install_at_exit_handler
109
+ at_exit do
110
+ if tmp_path.size?
111
+ FileUtils.mv(tmp_path, out_path)
112
+ else
113
+ tmp_path.unlink if tmp_path.exist?
114
+ end
115
+ end
116
+ end
117
+
97
118
  def processed_urls
98
119
  @processed_urls ||= begin
99
120
  urls = {}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch