title_grabber 0.3.0 → 0.3.1

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6a541f189b34294558e51d72b88dc447e55e7b1d2fb802463ccd9bf27c2b4e19
4
- data.tar.gz: a1ab49973e029e4bdcbcae37fefbafe6ae2e5042dc653aa3b3e7dbff42c2d804
3
+ metadata.gz: f341d404fbe5f16ef1eb904c75fbdd9331a76de7cb08c969d59526dbae9d95d0
4
+ data.tar.gz: 586a09fdca43a48db702b69e4e6e5d123e123d5aa6631a8df8c7b9eb46dd8820
5
5
  SHA512:
6
- metadata.gz: f6c4ecdb7a780af13b926146a76d337fd433def78e4326ee7e674fb8617a3902d5942ef61c3a14c3d7a3522920daf7a66c2fdc2e85a402c81f076a2589692eab
7
- data.tar.gz: 2d921af22310ede97c1035c858abb1c87da7125df09587124b1a6d36a1aab6fa0cc745d560a83cce74b9aca2a63ae96244c74a05429cd124ec6342fdb64598ac
6
+ metadata.gz: 14ceff1f56aea4cd1b9185b46e99af361ae243f809d50571933a96de2a757aa0589b4ecda34dfdeb9ff647c42412fed95acd1a1ee03fb875ed05229f53a71505
7
+ data.tar.gz: fc2924e719dab7d72b8adf119fe11d29909433a84732a19b878a2faf437dfb5e4e595c696e63d525864133499816bb9722c76af375dea8000851e455908705ae
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.0)
4
+ title_grabber (0.3.1)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -1,5 +1,3 @@
1
- require "logger"
2
-
3
1
  require "http"
4
2
 
5
3
  require_relative "text_helper"
@@ -37,7 +35,7 @@ module HTTPHelper
37
35
 
38
36
  if retries <= MAX_RETRIES
39
37
  rest_time = rand(REST_INTERVAL)
40
- logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time} secs - Retry ##{retries}"
38
+ logger.warn "[Thread: ##{Thread.current[:id]}] URL: #{url} [#{msg}] - Going to sleep for #{rest_time.round(1)} secs - Retry ##{retries}"
41
39
  sleep(rest_time)
42
40
  retry
43
41
  else
data/lib/text_helper.rb CHANGED
@@ -2,11 +2,16 @@ module TextHelper
2
2
  SINGLE_SPACE = -" "
3
3
 
4
4
  def utf8_encode(text = nil)
5
+ txt = String(text)
6
+
5
7
  begin
6
- String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
8
+ txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
7
9
  replace: -"")
8
10
  rescue EncodingError
9
11
  -""
12
+ else
13
+ txt.delete!(-"\u0000") # get rid of nasty null bytes
14
+ txt
10
15
  end
11
16
  end
12
17
 
@@ -15,7 +20,6 @@ module TextHelper
15
20
  text.strip!
16
21
  text.gsub!("\n", SINGLE_SPACE)
17
22
  text.gsub(/\s{2,}/, SINGLE_SPACE)
18
- text.delete!(-"\u0000") # get rid of nasty null bytes
19
23
  text
20
24
  end
21
25
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.0"
2
+ VERSION = "0.3.1"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -1,6 +1,8 @@
1
1
  require "csv"
2
2
  require "etc"
3
3
  require "fileutils"
4
+ require "logger"
5
+ require "pathname"
4
6
 
5
7
  require "bundler/setup"
6
8
  require "oga"
@@ -24,16 +26,27 @@ module TitleGrabber
24
26
  include HTTPHelper
25
27
  include TextHelper
26
28
 
27
- attr_reader :lines, :out_path
29
+ attr_reader :lines, :out_path, :tmp_path, :logger
28
30
 
29
31
  def initialize(lines, options)
30
32
  @lines = lines
31
33
  @out_path = options[:output]
34
+ @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
35
+
36
+ logging_target = if ENV["DEBUG"]
37
+ STDOUT
38
+ else
39
+ log_file = Pathname(__FILE__).sub_ext(".log").
40
+ basename.
41
+ open("w")
42
+ end
43
+ @logger = Logger.new(logging_target)
32
44
  end
33
45
 
34
46
  def call
47
+ install_at_exit_handler
48
+
35
49
  queue = Queue.new
36
- tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
37
50
  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
38
51
  csv << HEADERS
39
52
 
@@ -62,10 +75,10 @@ module TitleGrabber
62
75
  rescue ThreadError; end
63
76
 
64
77
  while url
65
- if html = open_w_timeout(url)
78
+ if (html = open_w_timeout(url)) && !html.empty?
66
79
  doc = begin
67
80
  Oga.parse_html(html)
68
- rescue LL::ParserError => err
81
+ rescue ArgumentError, LL::ParserError => err
69
82
  logger.error "[Thread: ##{Thread.current[:id]}] Unable to parse HTML from URL '#{url}' - #{err.message}"
70
83
  nil
71
84
  end
@@ -88,12 +101,20 @@ module TitleGrabber
88
101
  end
89
102
  }.each(&:join)
90
103
  end
91
-
92
- FileUtils.mv(tmp_path, out_path)
93
104
  end
94
105
 
95
106
  private
96
107
 
108
+ def install_at_exit_handler
109
+ at_exit do
110
+ if tmp_path.size?
111
+ FileUtils.mv(tmp_path, out_path)
112
+ else
113
+ tmp_path.unlink if tmp_path.exist?
114
+ end
115
+ end
116
+ end
117
+
97
118
  def processed_urls
98
119
  @processed_urls ||= begin
99
120
  urls = {}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch