title_grabber 0.4.0 → 0.4.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58643e0df803b9315f741db2effdcbd8b3d4e52845b06de27f18dd726f732bb6
4
- data.tar.gz: 013fcdb1650a497126e11b62845240489c296d8de7b0dcd9bc95ccbafb288633
3
+ metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
4
+ data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
5
5
  SHA512:
6
- metadata.gz: '09167bbc4fcd61034322ab62ec5ca68ebffcb8920c9f07622e6b69367c76a639c03314101a3588ecd0e53e62b08d180e323114e5c9f70760c8416e4a342b9850'
7
- data.tar.gz: d2f053afc4fc465049d8068302e1baf56a76ef1c6230813a7facd03e24207c03bfd104ebe45c7b2ae5dbc821140a59d4d6d3c4137e766258eb4cf19fbe0a92de
6
+ metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
7
+ data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.7)
4
+ title_grabber (0.4.0)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/README.md CHANGED
@@ -23,14 +23,14 @@ Or install it yourself as:
23
23
  Just pass it a list of files containing URLs (one per line)
24
24
 
25
25
  ```
26
- title-grabber /abs/path/2/file1.txt rel/path/2/file2.txt
26
+ title-grabber -f /abs/path/2/file1.txt,rel/path/2/file2.txt
27
27
  ```
28
28
 
29
29
  Data is either recorded to out.csv in the CWD or the file specified using the
30
30
  -o/--output argument, e.g.
31
31
 
32
32
  ```
33
- title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
33
+ title-grabber -o ~/output.csv -f /abs/path/2/file1.txt,rel/path/2/file2.txt
34
34
  ```
35
35
 
36
36
  See all available CLI switches and env vars
data/exe/title-grabber CHANGED
@@ -16,6 +16,11 @@ OptionParser.new do |args|
16
16
  exit
17
17
  end
18
18
 
19
+ args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
20
+ arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
21
+ select { |f| f.file? && f.exist? }
22
+ end
23
+
19
24
  args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
20
25
  arguments[:output] = Pathname(out)
21
26
  end
@@ -49,9 +54,9 @@ OptionParser.new do |args|
49
54
  end
50
55
  end.parse!
51
56
 
52
- if ARGV.empty?
57
+ if Array(arguments[:file_paths]).empty?
53
58
  STDERR.puts "At least 1 input file is required!\n"
54
59
  exit(1)
55
60
  else
56
- TitleGrabber.call(ARGF, arguments)
61
+ TitleGrabber.call(arguments)
57
62
  end
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -33,19 +33,19 @@ module TitleGrabber
33
33
  TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
34
34
  CSV_FIELD_SEP = -","
35
35
 
36
- def self.call(lines, options)
37
- MultiThreadedGrabber.new(lines, options).call
36
+ def self.call(options)
37
+ MultiThreadedGrabber.new(options).call
38
38
  end
39
39
 
40
40
  class MultiThreadedGrabber
41
41
  include HTTPHelper
42
42
  include TextHelper
43
43
 
44
- attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
44
+ attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
45
45
  :max_redirects, :max_retries, :max_threads, :logger
46
46
 
47
- def initialize(lines, options)
48
- @lines = lines
47
+ def initialize(options)
48
+ @file_paths = options[:file_paths]
49
49
 
50
50
  @out_path = options.fetch(:output, DEF_OUT_PATH)
51
51
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
@@ -72,19 +72,20 @@ module TitleGrabber
72
72
  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
73
73
  csv << HEADERS
74
74
 
75
- lines.each do |line|
76
- md = line.match(URL_RE)
77
- next unless md
75
+ file_paths.each do |file_path|
76
+ file_path.each_line do |line|
77
+ md = line.match(URL_RE)
78
+ next unless md
78
79
 
79
- url = md.to_s
80
- if h = processed_urls[url]
81
- csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
82
- next
83
- end
80
+ url = md.to_s
81
+ if h = processed_urls[url]
82
+ csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
83
+ next
84
+ end
84
85
 
85
- queue << url
86
+ queue << url
87
+ end
86
88
  end
87
- lines = nil
88
89
 
89
90
  thr_cnt = [max_threads, queue.size].min
90
91
  threads = 1.upto(thr_cnt).map.with_index { |_, i|
@@ -116,7 +117,7 @@ module TitleGrabber
116
117
  tweet_urls.compact!
117
118
  tweet_urls.uniq!
118
119
  tweet_urls.map! do |url|
119
- if res = open_w_timeout(url, **http_opts)
120
+ if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
120
121
  uri = res.uri
121
122
  uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
122
123
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-17 00:00:00.000000000 Z
11
+ date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http