title_grabber 0.3.7 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6642eaa211c58d0debf01ff8b0129832ef2c83d7e280f048bab0656ae7f0aec1
4
- data.tar.gz: 3d9bc77e04fd081dd9c3792c9cc378c6eac60a818c8cdd9efdfed134f52ec57d
3
+ metadata.gz: 58643e0df803b9315f741db2effdcbd8b3d4e52845b06de27f18dd726f732bb6
4
+ data.tar.gz: 013fcdb1650a497126e11b62845240489c296d8de7b0dcd9bc95ccbafb288633
5
5
  SHA512:
6
- metadata.gz: e5455e96ff7c4ad6654c0c3e57fff4dc60627c55fe586d7bc72f9492c591765451101bfaee1a3537a881713f469600ffc7343081ff9fcc73d56aa733af42e4e1
7
- data.tar.gz: 5474dcf7991beb7ea7c48cfe95fa98ab02cd976d068da9a1246ad518eb4963b5c701f46f54c41735b2d77f5309391dcf8f82692b803b401c1b70ef1e75d3147c
6
+ metadata.gz: '09167bbc4fcd61034322ab62ec5ca68ebffcb8920c9f07622e6b69367c76a639c03314101a3588ecd0e53e62b08d180e323114e5c9f70760c8416e4a342b9850'
7
+ data.tar.gz: d2f053afc4fc465049d8068302e1baf56a76ef1c6230813a7facd03e24207c03bfd104ebe45c7b2ae5dbc821140a59d4d6d3c4137e766258eb4cf19fbe0a92de
data/Gemfile.lock CHANGED
@@ -24,6 +24,8 @@ GEM
24
24
  http-form_data (2.1.1)
25
25
  http_parser.rb (0.6.0)
26
26
  minitest (5.11.3)
27
+ minitest-line (0.6.5)
28
+ minitest (~> 5.0)
27
29
  oga (2.15)
28
30
  ast
29
31
  ruby-ll (~> 2.1)
@@ -42,6 +44,7 @@ PLATFORMS
42
44
  DEPENDENCIES
43
45
  bundler (~> 1.17)
44
46
  minitest (~> 5.0)
47
+ minitest-line (~> 0.6)
45
48
  rake (~> 10.0)
46
49
  title_grabber!
47
50
 
data/README.md CHANGED
@@ -33,20 +33,10 @@ Data is either recorded to out.csv in the CWD or the file specified using the
33
33
  title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
34
  ```
35
35
 
36
- ### Environment Variables
36
+ See all available CLI switches and env vars
37
37
 
38
38
  ```
39
- DEBUG - when set it logs to STDOUT instead of to its default target, title_grabber.log
40
- ```
41
-
42
- ```
43
- MAX_THREADS - max. # of threads to use, defaults to the # of CPU cores in the machine
44
- ```
45
-
46
- ```
47
- CONNECT_TIMEOUT (in seconds) - defaults to 15
48
- READ_TIMEOUT (in seconds) - defaults to 15
49
- WRITE_TIMEOUT (in seconds) - defaults to 15
39
+ title-grabber -h
50
40
  ```
51
41
 
52
42
  ## Development
@@ -55,6 +45,8 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
55
45
 
56
46
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
57
47
 
48
+ Run rake (the default task runs the test suite) to make sure all tests pass.
49
+
58
50
  ## Contributing
59
51
 
60
52
  Bug reports and pull requests are welcome on GitHub at https://github.com/cristian-rasch/title_grabber.
data/lib/http_helper.rb CHANGED
@@ -21,7 +21,8 @@ module HTTPHelper
21
21
  }
22
22
  rescue HTTP::Redirector::TooManyRedirectsError
23
23
  logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
24
- rescue HTTP::Error, Timeout::Error => err
24
+ nil
25
+ rescue => err
25
26
  msg = err.message
26
27
 
27
28
  if err.kind_of?(HTTP::Error) || err.kind_of?(Timeout::Error) ||
@@ -35,19 +36,20 @@ module HTTPHelper
35
36
  retry
36
37
  else
37
38
  logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
38
- nil
39
39
  end
40
+ else
41
+ logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
40
42
  end
41
- rescue => err
42
- logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
43
+
43
44
  nil
44
- else
45
- [res.uri.to_s, utf8_encode(res.to_s)]
46
45
  end
47
46
  end
48
47
 
49
- def logger
50
- @logger ||= Logger.new(STDOUT)
48
+ def read_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
49
+ if res = open_w_timeout(url, write_to: write_to, connect_to: connect_to,
50
+ read_to: read_to, max_retries: max_retries)
51
+ [res.uri.to_s, utf8_encode(res.to_s)]
52
+ end
51
53
  end
52
54
 
53
55
  private
data/lib/text_helper.rb CHANGED
@@ -6,7 +6,7 @@ module TextHelper
6
6
 
7
7
  begin
8
8
  txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
9
- replace: -"")
9
+ replace: -"")
10
10
  rescue EncodingError
11
11
  -""
12
12
  else
@@ -18,8 +18,8 @@ module TextHelper
18
18
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
19
19
  def clean_up_whitespace(text)
20
20
  text.strip!
21
+ text.gsub!(/\s{2,}/, SINGLE_SPACE)
21
22
  text.gsub!("\n", SINGLE_SPACE)
22
- text.gsub(/\s{2,}/, SINGLE_SPACE)
23
23
  text
24
24
  end
25
25
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.3.7"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -26,6 +26,12 @@ module TitleGrabber
26
26
  ART_TIT_HEAD = -"article_title"
27
27
  HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
28
28
  ART_TIT_SEL = ["article h1", "h1"].freeze
29
+ TWEET_PERMA_LINK_SEL = -".tweet.permalink-tweet"
30
+ TWEET_TXT_SELS = %w(.tweet-text QuoteTweet).freeze
31
+ TWITTER_HOST = -"twitter.com"
32
+ TWITTER_STATUS_RE = %r(/status/\d+\z)
33
+ TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
34
+ CSV_FIELD_SEP = -","
29
35
 
30
36
  def self.call(lines, options)
31
37
  MultiThreadedGrabber.new(lines, options).call
@@ -72,7 +78,7 @@ module TitleGrabber
72
78
 
73
79
  url = md.to_s
74
80
  if h = processed_urls[url]
75
- csv << [url, h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
81
+ csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
76
82
  next
77
83
  end
78
84
 
@@ -90,8 +96,9 @@ module TitleGrabber
90
96
  rescue ThreadError; end
91
97
 
92
98
  while url
93
- end_url, html = open_w_timeout(url, **http_opts)
94
- if html && !html&.empty?
99
+ end_url, html = read_w_timeout(url, **http_opts)
100
+
101
+ if end_url && html && !html.empty?
95
102
  doc = begin
96
103
  Oga.parse_html(html)
97
104
  rescue ArgumentError, LL::ParserError => err
@@ -100,6 +107,34 @@ module TitleGrabber
100
107
  end
101
108
 
102
109
  if doc
110
+ tweet_urls = []
111
+ TWEET_TXT_SELS.each do |tweet_txt_sel|
112
+ tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
113
+ map { |a| a[-"href"] })
114
+
115
+ end
116
+ tweet_urls.compact!
117
+ tweet_urls.uniq!
118
+ tweet_urls.map! do |url|
119
+ if res = open_w_timeout(url, **http_opts)
120
+ uri = res.uri
121
+ uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
122
+ else
123
+ url
124
+ end
125
+ end
126
+ tweet_urls.compact!
127
+ tweet_urls.map! do |url|
128
+ url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
129
+ end
130
+ tweet_urls.delete_if { |url|
131
+ uri = URI(url)
132
+ uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
133
+ !uri.to_s.match?(TWITTER_STATUS_RE)
134
+ }
135
+ tweet_urls.sort!
136
+ end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
137
+
103
138
  page_title = doc.at_css('title')&.text || -""
104
139
  clean_up_whitespace(page_title) unless page_title.empty?
105
140
 
@@ -44,4 +44,5 @@ Gem::Specification.new do |spec|
44
44
  spec.add_development_dependency "bundler", "~> 1.17"
45
45
  spec.add_development_dependency "rake", "~> 10.0"
46
46
  spec.add_development_dependency "minitest", "~> 5.0"
47
+ spec.add_development_dependency "minitest-line", "~> 0.6"
47
48
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.7
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-10 00:00:00.000000000 Z
11
+ date: 2019-04-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
82
  version: '5.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: minitest-line
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '0.6'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '0.6'
83
97
  description:
84
98
  email:
85
99
  - cristianrasch@fastmail.fm