title_grabber 0.3.4 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8a8b57f801507ef8c856ebd741b247c30e08472b5df3be45d30e6fd7fa3c0092
4
- data.tar.gz: 75ff595060fe8dd71a669ebadcf38f040d5c47492a5c6ea770e893832493ddff
3
+ metadata.gz: efadff19dbc0622e9188c0349f80ff8d7e5a3040c91db075f3c51529efb39b03
4
+ data.tar.gz: 3230ec05cd35e8d3b797b9caa69eca8bf30a41598fb98b312b59522b9c348353
5
5
  SHA512:
6
- metadata.gz: 661dd87e4e60dcfd4a66168799f3b07babf5c21a890b2e61afb99b527b1e15cb7ebd875a60f39bd12cc2f06ef286b603efb89c4908f461606fa3c60a09ad5db6
7
- data.tar.gz: b6b1152856c43702082a788a1a056ccc2cef93101faada8890eda27b949d424cf3b9a83991fd91e07733f2c860003b26644510ae9635ea5c6336a1990f997445
6
+ metadata.gz: 02adc0c9b125a64a11fce2fc6be7ee1ab3d3798045536823f6c2763c7c004c950bfbacafdc443634e43c2f742adb42ba6213a5c1197575648b3ff23c5a2ab1e2
7
+ data.tar.gz: d487cbd2f084db0f076a42f03fe14912d614bacbf8a7ae33d3eed93f2e6125da69c5ef4433d70e94fbbe4103341bd86f07bd02c725d3721ce6f6816c00227e7e
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.4)
4
+ title_grabber (0.3.5)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/lib/http_helper.rb CHANGED
@@ -15,12 +15,11 @@ module HTTPHelper
15
15
  retries = 0
16
16
 
17
17
  begin
18
- body = Timeout.timeout(read_to) {
19
- HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
20
- follow(max_hops: MAX_HOPS).
21
- get(url, ssl_context: ssl_ctx).
22
- to_s
23
- }
18
+ res = Timeout.timeout(read_to) {
19
+ HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
20
+ follow(max_hops: MAX_HOPS).
21
+ get(url, ssl_context: ssl_ctx)
22
+ }
24
23
  rescue HTTP::Error, Timeout::Error => err
25
24
  msg = err.message
26
25
 
@@ -42,7 +41,7 @@ module HTTPHelper
42
41
  logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
43
42
  nil
44
43
  else
45
- utf8_encode(body)
44
+ [res.uri.to_s, utf8_encode(res.to_s)]
46
45
  end
47
46
  end
48
47
 
data/lib/title_grabber.rb CHANGED
@@ -20,9 +20,11 @@ module TitleGrabber
20
20
  MAX_THREADS = Etc.nprocessors
21
21
  URL_RE = %r(https?://\S+)i
22
22
  URL_HEADER = -"url"
23
+ END_URL_HEAD = -"end_url"
23
24
  PAGE_TIT_HEAD = -"page_title"
24
25
  ART_TIT_HEAD = -"article_title"
25
- HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
26
+ HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
27
+ ART_TIT_SEL = ["article h1", "h1"].freeze
26
28
 
27
29
  def self.call(lines, options)
28
30
  MultiThreadedGrabber.new(lines, options).call
@@ -86,7 +88,8 @@ module TitleGrabber
86
88
  rescue ThreadError; end
87
89
 
88
90
  while url
89
- if (html = open_w_timeout(url, **http_opts)) && !html.empty?
91
+ end_url, html = open_w_timeout(url, **http_opts)
92
+ if html && !html&.empty?
90
93
  doc = begin
91
94
  Oga.parse_html(html)
92
95
  rescue ArgumentError, LL::ParserError => err
@@ -97,11 +100,16 @@ module TitleGrabber
97
100
  if doc
98
101
  page_title = doc.at_css('title')&.text || -""
99
102
  clean_up_whitespace(page_title) unless page_title.empty?
100
- article_title = doc.at_css('article h1')&.text
101
- article_title ||= doc.at_css('h1')&.text || -""
103
+
104
+ article_title = nil
105
+ ART_TIT_SEL.each do |selector|
106
+ article_title = doc.at_css(selector)&.text
107
+ break if article_title && !article_title.empty?
108
+ end
109
+ article_title ||= -""
102
110
  clean_up_whitespace(article_title) unless article_title.empty?
103
111
 
104
- csv << [url, page_title, article_title]
112
+ csv << [url, end_url, page_title, article_title]
105
113
  end
106
114
  end
107
115
 
@@ -128,11 +136,13 @@ module TitleGrabber
128
136
 
129
137
  if out_path.exist?
130
138
  CSV.foreach(out_path, headers: true) do |r|
139
+ end_url = r[END_URL_HEAD]
131
140
  page_tit = r[PAGE_TIT_HEAD]
132
141
  art_tit = r[ART_TIT_HEAD]
133
142
 
134
143
  unless page_tit.empty? && art_tit.empty?
135
- urls[r[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
144
+ urls[r[URL_HEADER]] = { END_URL_HEAD => end_url,
145
+ PAGE_TIT_HEAD => page_tit,
136
146
  ART_TIT_HEAD => art_tit }
137
147
  end
138
148
  end
data/lib/title_grabber/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.4"
2
+ VERSION = "0.3.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.4
4
+ version: 0.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-07 00:00:00.000000000 Z
11
+ date: 2019-04-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http