title_grabber 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +2 -2
- data/exe/title-grabber +7 -2
- data/lib/title_grabber/version.rb +1 -1
- data/lib/title_grabber.rb +17 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
|
4
|
+
data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
|
7
|
+
data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -23,14 +23,14 @@ Or install it yourself as:
|
|
23
23
|
Just pass it a list of files containing URLs (one per line)
|
24
24
|
|
25
25
|
```
|
26
|
-
title-grabber /abs/path/2/file1.txt
|
26
|
+
title-grabber -f /abs/path/2/file1.txt,rel/path/2/file2.txt
|
27
27
|
```
|
28
28
|
|
29
29
|
Data is either recorded to out.csv in the CWD or the file specified using the
|
30
30
|
-o/--output argument, e.g.
|
31
31
|
|
32
32
|
```
|
33
|
-
title-grabber -o ~/output.csv /abs/path/2/file1.txt
|
33
|
+
title-grabber -o ~/output.csv -f /abs/path/2/file1.txt,rel/path/2/file2.txt
|
34
34
|
```
|
35
35
|
|
36
36
|
See all available CLI switches and env vars
|
data/exe/title-grabber
CHANGED
@@ -16,6 +16,11 @@ OptionParser.new do |args|
|
|
16
16
|
exit
|
17
17
|
end
|
18
18
|
|
19
|
+
args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
|
20
|
+
arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
|
21
|
+
select { |f| f.file? && f.exist? }
|
22
|
+
end
|
23
|
+
|
19
24
|
args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
|
20
25
|
arguments[:output] = Pathname(out)
|
21
26
|
end
|
@@ -49,9 +54,9 @@ OptionParser.new do |args|
|
|
49
54
|
end
|
50
55
|
end.parse!
|
51
56
|
|
52
|
-
if
|
57
|
+
if Array(arguments[:file_paths]).empty?
|
53
58
|
STDERR.puts "At least 1 input file is required!\n"
|
54
59
|
exit(1)
|
55
60
|
else
|
56
|
-
TitleGrabber.call(
|
61
|
+
TitleGrabber.call(arguments)
|
57
62
|
end
|
data/lib/title_grabber.rb
CHANGED
@@ -33,19 +33,19 @@ module TitleGrabber
|
|
33
33
|
TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
|
34
34
|
CSV_FIELD_SEP = -","
|
35
35
|
|
36
|
-
def self.call(
|
37
|
-
MultiThreadedGrabber.new(
|
36
|
+
def self.call(options)
|
37
|
+
MultiThreadedGrabber.new(options).call
|
38
38
|
end
|
39
39
|
|
40
40
|
class MultiThreadedGrabber
|
41
41
|
include HTTPHelper
|
42
42
|
include TextHelper
|
43
43
|
|
44
|
-
attr_reader :
|
44
|
+
attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
|
45
45
|
:max_redirects, :max_retries, :max_threads, :logger
|
46
46
|
|
47
|
-
def initialize(
|
48
|
-
@
|
47
|
+
def initialize(options)
|
48
|
+
@file_paths = options[:file_paths]
|
49
49
|
|
50
50
|
@out_path = options.fetch(:output, DEF_OUT_PATH)
|
51
51
|
@tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
|
@@ -72,19 +72,20 @@ module TitleGrabber
|
|
72
72
|
CSV.open(tmp_path, "w", force_quotes: true) do |csv|
|
73
73
|
csv << HEADERS
|
74
74
|
|
75
|
-
|
76
|
-
|
77
|
-
|
75
|
+
file_paths.each do |file_path|
|
76
|
+
file_path.each_line do |line|
|
77
|
+
md = line.match(URL_RE)
|
78
|
+
next unless md
|
78
79
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
80
|
+
url = md.to_s
|
81
|
+
if h = processed_urls[url]
|
82
|
+
csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
|
83
|
+
next
|
84
|
+
end
|
84
85
|
|
85
|
-
|
86
|
+
queue << url
|
87
|
+
end
|
86
88
|
end
|
87
|
-
lines = nil
|
88
89
|
|
89
90
|
thr_cnt = [max_threads, queue.size].min
|
90
91
|
threads = 1.upto(thr_cnt).map.with_index { |_, i|
|
@@ -116,7 +117,7 @@ module TitleGrabber
|
|
116
117
|
tweet_urls.compact!
|
117
118
|
tweet_urls.uniq!
|
118
119
|
tweet_urls.map! do |url|
|
119
|
-
if res = open_w_timeout(url, **http_opts)
|
120
|
+
if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
|
120
121
|
uri = res.uri
|
121
122
|
uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
|
122
123
|
else
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: title_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Cristian Rasch
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-04-
|
11
|
+
date: 2019-04-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: http
|