title_grabber 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -0
- data/README.md +4 -12
- data/lib/http_helper.rb +10 -8
- data/lib/text_helper.rb +2 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +38 -3
- data/title_grabber.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58643e0df803b9315f741db2effdcbd8b3d4e52845b06de27f18dd726f732bb6
|
4
|
+
data.tar.gz: 013fcdb1650a497126e11b62845240489c296d8de7b0dcd9bc95ccbafb288633
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '09167bbc4fcd61034322ab62ec5ca68ebffcb8920c9f07622e6b69367c76a639c03314101a3588ecd0e53e62b08d180e323114e5c9f70760c8416e4a342b9850'
|
7
|
+
data.tar.gz: d2f053afc4fc465049d8068302e1baf56a76ef1c6230813a7facd03e24207c03bfd104ebe45c7b2ae5dbc821140a59d4d6d3c4137e766258eb4cf19fbe0a92de
|
data/Gemfile.lock
CHANGED
@@ -24,6 +24,8 @@ GEM
|
|
24
24
|
http-form_data (2.1.1)
|
25
25
|
http_parser.rb (0.6.0)
|
26
26
|
minitest (5.11.3)
|
27
|
+
minitest-line (0.6.5)
|
28
|
+
minitest (~> 5.0)
|
27
29
|
oga (2.15)
|
28
30
|
ast
|
29
31
|
ruby-ll (~> 2.1)
|
@@ -42,6 +44,7 @@ PLATFORMS
|
|
42
44
|
DEPENDENCIES
|
43
45
|
bundler (~> 1.17)
|
44
46
|
minitest (~> 5.0)
|
47
|
+
minitest-line (~> 0.6)
|
45
48
|
rake (~> 10.0)
|
46
49
|
title_grabber!
|
47
50
|
|
data/README.md
CHANGED
@@ -33,20 +33,10 @@ Data is either recorded to out.csv in the CWD or the file specified using the
|
|
33
33
|
title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
|
34
34
|
```
|
35
35
|
|
36
|
-
|
36
|
+
See all available CLI switches and env vars
|
37
37
|
|
38
38
|
```
|
39
|
-
|
40
|
-
```
|
41
|
-
|
42
|
-
```
|
43
|
-
MAX_THREADS - max. # of threads to use, defaults to the # of CPU cores in the machine
|
44
|
-
```
|
45
|
-
|
46
|
-
```
|
47
|
-
CONNECT_TIMEOUT (in seconds) - defaults to 15
|
48
|
-
READ_TIMEOUT (in seconds) - defaults to 15
|
49
|
-
WRITE_TIMEOUT (in seconds) - defaults to 15
|
39
|
+
title-grabber -h
|
50
40
|
```
|
51
41
|
|
52
42
|
## Development
|
@@ -55,6 +45,8 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
55
45
|
|
56
46
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
57
47
|
|
48
|
+
Run rake (the default task runs the test suite) to make sure all tests pass.
|
49
|
+
|
58
50
|
## Contributing
|
59
51
|
|
60
52
|
Bug reports and pull requests are welcome on GitHub at https://github.com/cristian-rasch/title_grabber.
|
data/lib/http_helper.rb
CHANGED
@@ -21,7 +21,8 @@ module HTTPHelper
|
|
21
21
|
}
|
22
22
|
rescue HTTP::Redirector::TooManyRedirectsError
|
23
23
|
logger.warn "[#{Thread.current.name}] GET #{url} resulted in more than #{max_redirects} redirect#{'s' unless max_redirects == 1}"
|
24
|
-
|
24
|
+
nil
|
25
|
+
rescue => err
|
25
26
|
msg = err.message
|
26
27
|
|
27
28
|
if err.kind_of?(HTTP::Error) || err.kind_of?(Timeout::Error) ||
|
@@ -35,19 +36,20 @@ module HTTPHelper
|
|
35
36
|
retry
|
36
37
|
else
|
37
38
|
logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
|
38
|
-
nil
|
39
39
|
end
|
40
|
+
else
|
41
|
+
logger.error "[#{Thread.current.name}] URL: #{url} [#{msg}]"
|
40
42
|
end
|
41
|
-
|
42
|
-
logger.error "[#{Thread.current.name}] URL: #{url} [#{err.message}]"
|
43
|
+
|
43
44
|
nil
|
44
|
-
else
|
45
|
-
[res.uri.to_s, utf8_encode(res.to_s)]
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
49
|
-
def
|
50
|
-
|
48
|
+
def read_w_timeout(url, write_to:, connect_to:, read_to:, max_retries:)
|
49
|
+
if res = open_w_timeout(url, write_to: write_to, connect_to: connect_to,
|
50
|
+
read_to: read_to, max_retries: max_retries)
|
51
|
+
[res.uri.to_s, utf8_encode(res.to_s)]
|
52
|
+
end
|
51
53
|
end
|
52
54
|
|
53
55
|
private
|
data/lib/text_helper.rb
CHANGED
@@ -6,7 +6,7 @@ module TextHelper
|
|
6
6
|
|
7
7
|
begin
|
8
8
|
txt.encode!(-"UTF-8", invalid: :replace, undef: :replace,
|
9
|
-
|
9
|
+
replace: -"")
|
10
10
|
rescue EncodingError
|
11
11
|
-""
|
12
12
|
else
|
@@ -18,8 +18,8 @@ module TextHelper
|
|
18
18
|
# document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
|
19
19
|
def clean_up_whitespace(text)
|
20
20
|
text.strip!
|
21
|
+
text.gsub!(/\s{2,}/, SINGLE_SPACE)
|
21
22
|
text.gsub!("\n", SINGLE_SPACE)
|
22
|
-
text.gsub(/\s{2,}/, SINGLE_SPACE)
|
23
23
|
text
|
24
24
|
end
|
25
25
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -26,6 +26,12 @@ module TitleGrabber
|
|
26
26
|
ART_TIT_HEAD = -"article_title"
|
27
27
|
HEADERS = [URL_HEADER, END_URL_HEAD, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze
|
28
28
|
ART_TIT_SEL = ["article h1", "h1"].freeze
|
29
|
+
TWEET_PERMA_LINK_SEL = -".tweet.permalink-tweet"
|
30
|
+
TWEET_TXT_SELS = %w(.tweet-text QuoteTweet).freeze
|
31
|
+
TWITTER_HOST = -"twitter.com"
|
32
|
+
TWITTER_STATUS_RE = %r(/status/\d+\z)
|
33
|
+
TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
|
34
|
+
CSV_FIELD_SEP = -","
|
29
35
|
|
30
36
|
def self.call(lines, options)
|
31
37
|
MultiThreadedGrabber.new(lines, options).call
|
@@ -72,7 +78,7 @@ module TitleGrabber
|
|
72
78
|
|
73
79
|
url = md.to_s
|
74
80
|
if h = processed_urls[url]
|
75
|
-
csv << [url, h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
|
81
|
+
csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
|
76
82
|
next
|
77
83
|
end
|
78
84
|
|
@@ -90,8 +96,9 @@ module TitleGrabber
|
|
90
96
|
rescue ThreadError; end
|
91
97
|
|
92
98
|
while url
|
93
|
-
end_url, html =
|
94
|
-
|
99
|
+
end_url, html = read_w_timeout(url, **http_opts)
|
100
|
+
|
101
|
+
if end_url && html && !html.empty?
|
95
102
|
doc = begin
|
96
103
|
Oga.parse_html(html)
|
97
104
|
rescue ArgumentError, LL::ParserError => err
|
@@ -100,6 +107,34 @@ module TitleGrabber
|
|
100
107
|
end
|
101
108
|
|
102
109
|
if doc
|
110
|
+
tweet_urls = []
|
111
|
+
TWEET_TXT_SELS.each do |tweet_txt_sel|
|
112
|
+
tweet_urls.concat(doc.css("#{TWEET_PERMA_LINK_SEL} #{tweet_txt_sel} a").
|
113
|
+
map { |a| a[-"href"] })
|
114
|
+
|
115
|
+
end
|
116
|
+
tweet_urls.compact!
|
117
|
+
tweet_urls.uniq!
|
118
|
+
tweet_urls.map! do |url|
|
119
|
+
if res = open_w_timeout(url, **http_opts)
|
120
|
+
uri = res.uri
|
121
|
+
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
122
|
+
else
|
123
|
+
url
|
124
|
+
end
|
125
|
+
end
|
126
|
+
tweet_urls.compact!
|
127
|
+
tweet_urls.map! do |url|
|
128
|
+
url.start_with?("/") ? URI.join(TWITTER_URL_PREFIX, url).to_s : url
|
129
|
+
end
|
130
|
+
tweet_urls.delete_if { |url|
|
131
|
+
uri = URI(url)
|
132
|
+
uri.host == TWITTER_HOST && uri.path.count("/") > 1 &&
|
133
|
+
!uri.to_s.match?(TWITTER_STATUS_RE)
|
134
|
+
}
|
135
|
+
tweet_urls.sort!
|
136
|
+
end_url = tweet_urls.join(CSV_FIELD_SEP) unless tweet_urls.empty?
|
137
|
+
|
103
138
|
page_title = doc.at_css('title')&.text || -""
|
104
139
|
clean_up_whitespace(page_title) unless page_title.empty?
|
105
140
|
|
data/title_grabber.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.7
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - "~>"
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '5.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: minitest-line
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0.6'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0.6'
|
83
97
|
description:
|
84
98
|
email:
|
85
99
|
- cristianrasch@fastmail.fm
|