title_grabber 0.3.7 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6642eaa211c58d0debf01ff8b0129832ef2c83d7e280f048bab0656ae7f0aec1
4
- data.tar.gz: 3d9bc77e04fd081dd9c3792c9cc378c6eac60a818c8cdd9efdfed134f52ec57d
3
+ metadata.gz: 58643e0df803b9315f741db2effdcbd8b3d4e52845b06de27f18dd726f732bb6
4
+ data.tar.gz: 013fcdb1650a497126e11b62845240489c296d8de7b0dcd9bc95ccbafb288633
5
5
  SHA512:
6
- metadata.gz: e5455e96ff7c4ad6654c0c3e57fff4dc60627c55fe586d7bc72f9492c591765451101bfaee1a3537a881713f469600ffc7343081ff9fcc73d56aa733af42e4e1
7
- data.tar.gz: 5474dcf7991beb7ea7c48cfe95fa98ab02cd976d068da9a1246ad518eb4963b5c701f46f54c41735b2d77f5309391dcf8f82692b803b401c1b70ef1e75d3147c
6
+ metadata.gz: '09167bbc4fcd61034322ab62ec5ca68ebffcb8920c9f07622e6b69367c76a639c03314101a3588ecd0e53e62b08d180e323114e5c9f70760c8416e4a342b9850'
7
+ data.tar.gz: d2f053afc4fc465049d8068302e1baf56a76ef1c6230813a7facd03e24207c03bfd104ebe45c7b2ae5dbc821140a59d4d6d3c4137e766258eb4cf19fbe0a92de
data/Gemfile.lock CHANGED
@@ -24,6 +24,8 @@ GEM
24
24
  http-form_data (2.1.1)
25
25
  http_parser.rb (0.6.0)
26
26
  minitest (5.11.3)
27
+ minitest-line (0.6.5)
28
+ minitest (~> 5.0)
27
29
  oga (2.15)
28
30
  ast
29
31
  ruby-ll (~> 2.1)
@@ -42,6 +44,7 @@ PLATFORMS
42
44
  DEPENDENCIES
43
45
  bundler (~> 1.17)
44
46
  minitest (~> 5.0)
47
+ minitest-line (~> 0.6)
45
48
  rake (~> 10.0)
46
49
  title_grabber!
47
50
 
data/README.md CHANGED
@@ -33,20 +33,10 @@ Data is either recorded to out.csv in the CWD or the file specified using the
33
33
  title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
34
  ```
35
35
 
36
- ### Environment Variables
36
+ See all available CLI switches and env vars
37
37
 
38
38
  ```
39
- DEBUG - when set it logs to STDOUT instead of to its default target, title_grabber.log
40
- ```
41
-
42
- ```
43
- MAX_THREADS - max. # of threads to use, defaults to the # of CPU cores in the machine
44
- ```
45
-
46
- ```
47
- CONNECT_TIMEOUT (in seconds) - defaults to 15
48
- READ_TIMEOUT (in seconds) - defaults to 15
49
- WRITE_TIMEOUT (in seconds) - defaults to 15
39
+ title-grabber -h
50
40
  ```
51
41
 
52
42
  ## Development
@@ -55,6 +45,8 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
55
45
 
56
46
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
57
47
 
48
+ Run rake (the default task runs the test suite) to make sure all tests pass.
49
+
58
50
  ## Contributing
59
51
 
60
52
  Bug reports and pull requests are welcome on GitHub at https://github.com/cristian-rasch/title_grabber.
data/lib/http_helper.rb CHANGED
@@ -21,7 +21,8 @@ module HTTPHelper
21
21
  }
22
22
  rescue HTTP::Redirector::TooManyRedirectsError
23
23
  logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
24
- rescue HTTP::Error, Timeout::Error => err
24
+ nil
25
+ rescue => err
25
26
  msg = err.message
26
27
 
27
28
  if err.kind_of?(HTTP::Error) || err.kind_of?(Timeout::Error) ||
@@ -35,19 +36,20 @@ module HTTPHelper
35
36
  retry
36
37
  else
37
38
  logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
38
- nil
39
39
  end
40
+ else
41
+ logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
40
42
  end
41
- rescue => err
42
- logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
43
+
43
44
  nil
44
- else
45
- [res.uri.to_s, utf8_encode(res.to_s)]
46
45
  end
47
46
  end
48
47
 
49
- def logger
50
- @logger ||= Logger.new(STDOUT)
48
+ def read_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
49
+ if res = open_w_timeout(url, write_to: write_to, connect_to: connect_to,
50
+ read_to: read_to, max_retries: max_retries)
51
+ [res.uri.to_s, utf8_encode(res.to_s)]
52
+ end
51
53
  end
52
54
 
53
55
  private
data/lib/text_helper.rb CHANGED
@@ -6,7 +6,7 @@ module TextHelper
6
6
 
7
7
  begin
8
8
  txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
9
- replace: -"")
9
+ replace: -"")
10
10
  rescue EncodingError
11
11
  -""
12
12
  else
@@ -18,8 +18,8 @@ module TextHelper
18
18
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
19
19
  def clean_up_whitespace(text)
20
20
  text.strip!
21
+ text.gsub!(/\s{2,}/, SINGLE_SPACE)
21
22
  text.gsub!("\n", SINGLE_SPACE)
22
- text.gsub(/\s{2,}/, SINGLE_SPACE)
23
23
  text
24
24
  end
25
25
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.7"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -26,6 +26,12 @@ module TitleGrabber
26
26
  ART_TIT_HEAD = -"article_title"
27
27
  HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
28
28
  ART_TIT_SEL = ["article h1", "h1"].freeze
29
+ TWEET_PERMA_LINK_SEL = -".tweet.permalink-tweet"
30
+ TWEET_TXT_SELS = %w(.tweet-text QuoteTweet).freeze
31
+ TWITTER_HOST = -"twitter.com"
32
+ TWITTER_STATUS_RE = %r(/status/\d+\z)
33
+ TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
34
+ CSV_FIELD_SEP = -","
29
35
 
30
36
  def self.call(lines, options)
31
37
  MultiThreadedGrabber.new(lines, options).call
@@ -72,7 +78,7 @@ module TitleGrabber
72
78
 
73
79
  url = md.to_s
74
80
  if h = processed_urls[url]
75
- csv << [url, h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
81
+ csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
76
82
  next
77
83
  end
78
84
 
@@ -90,8 +96,9 @@ module TitleGrabber
90
96
  rescue ThreadError; end
91
97
 
92
98
  while url
93
- end_url, html = open_w_timeout(url, **http_opts)
94
- if html && !html&.empty?
99
+ end_url, html = read_w_timeout(url, **http_opts)
100
+
101
+ if end_url && html && !html.empty?
95
102
  doc = begin
96
103
  Oga.parse_html(html)
97
104
  rescue ArgumentError, LL::ParserError => err
@@ -100,6 +107,34 @@ module TitleGrabber
100
107
  end
101
108
 
102
109
  if doc
110
+ tweet_urls = []
111
+ TWEET_TXT_SELS.each do |tweet_txt_sel|
112
+ tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
113
+ map { |a| a[-"href"] })
114
+
115
+ end
116
+ tweet_urls.compact!
117
+ tweet_urls.uniq!
118
+ tweet_urls.map! do |url|
119
+ if res = open_w_timeout(url, **http_opts)
120
+ uri = res.uri
121
+ uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
122
+ else
123
+ url
124
+ end
125
+ end
126
+ tweet_urls.compact!
127
+ tweet_urls.map! do |url|
128
+ url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
129
+ end
130
+ tweet_urls.delete_if { |url|
131
+ uri = URI(url)
132
+ uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
133
+ !uri.to_s.match?(TWITTER_STATUS_RE)
134
+ }
135
+ tweet_urls.sort!
136
+ end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
137
+
103
138
  page_title = doc.at_css('title')&.text || -""
104
139
  clean_up_whitespace(page_title) unless page_title.empty?
105
140
 
@@ -44,4 +44,5 @@ Gem::Specification.new do |spec|
44
44
  spec.add_development_dependency "bundler", "~> 1.17"
45
45
  spec.add_development_dependency "rake", "~> 10.0"
46
46
  spec.add_development_dependency "minitest", "~> 5.0"
47
+ spec.add_development_dependency "minitest-line", "~> 0.6"
47
48
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-10 00:00:00.000000000 Z
11
+ date: 2019-04-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '5.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: minitest-line
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.6'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.6'
83
97
  description:
84
98
  email:
85
99
  - cristianrasch@fastmail.fm