title_grabber 0.4.0 → 0.4.1
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -2
- data/exe/title-grabber +7 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +17 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
|
4
|
+
data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
|
7
|
+
data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -23,14 +23,14 @@ Or install it yourself as:
|
|
23
23
|
Just pass it a list of files containing URLs (one per line)
|
24
24
|
|
25
25
|
```
|
26
|
-
title-grabber /abs/path/2/file1.txt
|
26
|
+
title-grabber -f /abs/path/2/file1.txt,rel/path/2/file2.txt
|
27
27
|
```
|
28
28
|
|
29
29
|
Data is either recorded to out.csv in the CWD or the file specified using the
|
30
30
|
-o/--output argument, e.g.
|
31
31
|
|
32
32
|
```
|
33
|
-
title-grabber -o ~/output.csv /abs/path/2/file1.txt
|
33
|
+
title-grabber -o ~/output.csv -f /abs/path/2/file1.txt,rel/path/2/file2.txt
|
34
34
|
```
|
35
35
|
|
36
36
|
See all available CLI switches and env vars
|
data/exe/title-grabber
CHANGED
@@ -16,6 +16,11 @@ OptionParser.new do |args|
|
|
16
16
|
exit
|
17
17
|
end
|
18
18
|
|
19
|
+
args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
|
20
|
+
arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
|
21
|
+
select { |f| f.file? && f.exist? }
|
22
|
+
end
|
23
|
+
|
19
24
|
args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
|
20
25
|
arguments[:output] = Pathname(out)
|
21
26
|
end
|
@@ -49,9 +54,9 @@ OptionParser.new do |args|
|
|
49
54
|
end
|
50
55
|
end.parse!
|
51
56
|
|
52
|
-
if
|
57
|
+
if Array(arguments[:file_paths]).empty?
|
53
58
|
STDERR.puts "At least 1 input file is required!\n"
|
54
59
|
exit(1)
|
55
60
|
else
|
56
|
-
TitleGrabber.call(
|
61
|
+
TitleGrabber.call(arguments)
|
57
62
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -33,19 +33,19 @@ module TitleGrabber
|
|
33
33
|
TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
|
34
34
|
CSV_FIELD_SEP = -","
|
35
35
|
|
36
|
-
def self.call(
|
37
|
-
MultiThreadedGrabber.new(
|
36
|
+
def self.call(options)
|
37
|
+
MultiThreadedGrabber.new(options).call
|
38
38
|
end
|
39
39
|
|
40
40
|
class MultiThreadedGrabber
|
41
41
|
include HTTPHelper
|
42
42
|
include TextHelper
|
43
43
|
|
44
|
-
attr_reader :
|
44
|
+
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
|
45
45
|
:max_redirects, :max_retries, :max_threads, :logger
|
46
46
|
|
47
|
-
def initialize(
|
48
|
-
@
|
47
|
+
def initialize(options)
|
48
|
+
@file_paths = options[:file_paths]
|
49
49
|
|
50
50
|
@out_path = options.fetch(:output, DEF_OUT_PATH)
|
51
51
|
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
@@ -72,19 +72,20 @@ module TitleGrabber
|
|
72
72
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
73
73
|
csv << HEADERS
|
74
74
|
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
file_paths.each do |file_path|
|
76
|
+
file_path.each_line do |line|
|
77
|
+
md = line.match(URL_RE)
|
78
|
+
next unless md
|
78
79
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
80
|
+
url = md.to_s
|
81
|
+
if h = processed_urls[url]
|
82
|
+
csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
|
83
|
+
next
|
84
|
+
end
|
84
85
|
|
85
|
-
|
86
|
+
queue << url
|
87
|
+
end
|
86
88
|
end
|
87
|
-
lines = nil
|
88
89
|
|
89
90
|
thr_cnt = [max_threads, queue.size].min
|
90
91
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
@@ -116,7 +117,7 @@ module TitleGrabber
|
|
116
117
|
tweet_urls.compact!
|
117
118
|
tweet_urls.uniq!
|
118
119
|
tweet_urls.map! do |url|
|
119
|
-
if res = open_w_timeout(url, **http_opts)
|
120
|
+
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
120
121
|
uri = res.uri
|
121
122
|
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
122
123
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|