title_grabber 0.3.4 → 0.3.5

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a8b57f801507ef8c856ebd741b247c30e08472b5df3be45d30e6fd7fa3c0092
4
- data.tar.gz: 75ff595060fe8dd71a669ebadcf38f040d5c47492a5c6ea770e893832493ddff
3
+ metadata.gz: efadff19dbc0622e9188c0349f80ff8d7e5a3040c91db075f3c51529efb39b03
4
+ data.tar.gz: 3230ec05cd35e8d3b797b9caa69eca8bf30a41598fb98b312b59522b9c348353
5
5
  SHA512:
6
- metadata.gz: 661dd87e4e60dcfd4a66168799f3b07babf5c21a890b2e61afb99b527b1e15cb7ebd875a60f39bd12cc2f06ef286b603efb89c4908f461606fa3c60a09ad5db6
7
- data.tar.gz: b6b1152856c43702082a788a1a056ccc2cef93101faada8890eda27b949d424cf3b9a83991fd91e07733f2c860003b26644510ae9635ea5c6336a1990f997445
6
+ metadata.gz: 02adc0c9b125a64a11fce2fc6be7ee1ab3d3798045536823f6c2763c7c004c950bfbacafdc443634e43c2f742adb42ba6213a5c1197575648b3ff23c5a2ab1e2
7
+ data.tar.gz: d487cbd2f084db0f076a42f03fe14912d614bacbf8a7ae33d3eed93f2e6125da69c5ef4433d70e94fbbe4103341bd86f07bd02c725d3721ce6f6816c00227e7e
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.4)
4
+ title_grabber (0.3.5)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -15,12 +15,11 @@ module HTTPHelper
15
15
  retries = 0
16
16
 
17
17
  begin
18
- body = Timeout.timeout(read_to) {
19
- HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
20
- follow(max_hops: MAX_HOPS).
21
- get(url, ssl_context: ssl_ctx).
22
- to_s
23
- }
18
+ res = Timeout.timeout(read_to) {
19
+ HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
20
+ follow(max_hops: MAX_HOPS).
21
+ get(url, ssl_context: ssl_ctx)
22
+ }
24
23
  rescue HTTP::Error, Timeout::Error => err
25
24
  msg = err.message
26
25
 
@@ -42,7 +41,7 @@ module HTTPHelper
42
41
  logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
43
42
  nil
44
43
  else
45
- utf8_encode(body)
44
+ [res.uri.to_s, utf8_encode(res.to_s)]
46
45
  end
47
46
  end
48
47
 
data/lib/title_grabber.rb CHANGED
@@ -20,9 +20,11 @@ module TitleGrabber
20
20
  MAX_THREADS = Etc.nprocessors
21
21
  URL_RE = %r(https?://\S+)i
22
22
  URL_HEADER = -"url"
23
+ END_URL_HEAD = -"end_url"
23
24
  PAGE_TIT_HEAD = -"page_title"
24
25
  ART_TIT_HEAD = -"article_title"
25
- HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
26
+ HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
27
+ ART_TIT_SEL = ["article h1", "h1"].freeze
26
28
 
27
29
  def self.call(lines, options)
28
30
  MultiThreadedGrabber.new(lines, options).call
@@ -86,7 +88,8 @@ module TitleGrabber
86
88
  rescue ThreadError; end
87
89
 
88
90
  while url
89
- if (html = open_w_timeout(url, **http_opts)) && !html.empty?
91
+ end_url, html = open_w_timeout(url, **http_opts)
92
+ if html && !html&.empty?
90
93
  doc = begin
91
94
  Oga.parse_html(html)
92
95
  rescue ArgumentError, LL::ParserError => err
@@ -97,11 +100,16 @@ module TitleGrabber
97
100
  if doc
98
101
  page_title = doc.at_css('title')&.text || -""
99
102
  clean_up_whitespace(page_title) unless page_title.empty?
100
- article_title = doc.at_css('article h1')&.text
101
- article_title ||= doc.at_css('h1')&.text || -""
103
+
104
+ article_title = nil
105
+ ART_TIT_SEL.each do |selector|
106
+ article_title = doc.at_css(selector)&.text
107
+ break if article_title && !article_title.empty?
108
+ end
109
+ article_title ||= -""
102
110
  clean_up_whitespace(article_title) unless article_title.empty?
103
111
 
104
- csv << [url, page_title, article_title]
112
+ csv << [url, end_url, page_title, article_title]
105
113
  end
106
114
  end
107
115
 
@@ -128,11 +136,13 @@ module TitleGrabber
128
136
 
129
137
  if out_path.exist?
130
138
  CSV.foreach(out_path, headers: true) do |r|
139
+ end_url = r[END_URL_HEAD]
131
140
  page_tit = r[PAGE_TIT_HEAD]
132
141
  art_tit = r[ART_TIT_HEAD]
133
142
 
134
143
  unless page_tit.empty? && art_tit.empty?
135
- urls[r[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
144
+ urls[r[URL_HEADER]] = { END_URL_HEAD => end_url,
145
+ PAGE_TIT_HEAD => page_tit,
136
146
  ART_TIT_HEAD => art_tit }
137
147
  end
138
148
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.4"
2
+ VERSION = "0.3.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-07 00:00:00.000000000 Z
11
+ date: 2019-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http