title_grabber 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 58643e0df803b9315f741db2effdcbd8b3d4e52845b06de27f18dd726f732bb6
4
- data.tar.gz: 013fcdb1650a497126e11b62845240489c296d8de7b0dcd9bc95ccbafb288633
3
+ metadata.gz: 58ceec17ac17673c12eee2cdd5fbef23cdc561b89b5820f652e592a91047a399
4
+ data.tar.gz: 3f4be7dbff89b096c51fb28839d0bd2f4a85873507ee6600b7d2e481c16ea766
5
5
  SHA512:
6
- metadata.gz: '09167bbc4fcd61034322ab62ec5ca68ebffcb8920c9f07622e6b69367c76a639c03314101a3588ecd0e53e62b08d180e323114e5c9f70760c8416e4a342b9850'
7
- data.tar.gz: d2f053afc4fc465049d8068302e1baf56a76ef1c6230813a7facd03e24207c03bfd104ebe45c7b2ae5dbc821140a59d4d6d3c4137e766258eb4cf19fbe0a92de
6
+ metadata.gz: 2e87da24fb755d6869b68af2fd201a19e7f9ea6754b1dd26263f58124055847a9594cd186a88c39b2b91f59507f969a35b47bbf4924ca0c35c8f2776e4533836
7
+ data.tar.gz: '0957bf0dc17b6f02dff00b185ae0fd9ba163ce9610655063dab0ce8e16bd8c761db1a7cd8d283a685a408ed2470bef1ac7bcb42167f995e2857feeeb94e7647c'
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- title_grabber (0.3.7)
4
+ title_grabber (0.4.0)
5
5
  http (~> 4.1)
6
6
  oga (~> 2.15)
7
7
 
data/README.md CHANGED
@@ -23,14 +23,14 @@ Or install it yourself as:
23
23
  Just pass it a list of files containing URLs (one per line)
24
24
 
25
25
  ```
26
- title-grabber /abs/path/2/file1.txt rel/path/2/file2.txt
26
+ title-grabber -f /abs/path/2/file1.txt,rel/path/2/file2.txt
27
27
  ```
28
28
 
29
29
  Data is either recorded to out.csv in the CWD or the file specified using the
30
30
  -o/--output argument, e.g.
31
31
 
32
32
  ```
33
- title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
33
+ title-grabber -o ~/output.csv -f /abs/path/2/file1.txt,rel/path/2/file2.txt
34
34
  ```
35
35
 
36
36
  See all available CLI switches and env vars
data/exe/title-grabber CHANGED
@@ -16,6 +16,11 @@ OptionParser.new do |args|
16
16
  exit
17
17
  end
18
18
 
19
+ args.on("-f", "--files /abs/f1,rel/f2", Array, "1 or more comma-separated paths to text files containing 1 URL per line ") do |files|
20
+ arguments[:file_paths] = files.map { |f| Pathname(f).expand_path }.
21
+ select { |f| f.file? && f.exist? }
22
+ end
23
+
19
24
  args.on("-o", "--output FILE", "Output file. Defaults to #{TitleGrabber::DEF_OUT_PATH.basename}") do |out|
20
25
  arguments[:output] = Pathname(out)
21
26
  end
@@ -49,9 +54,9 @@ OptionParser.new do |args|
49
54
  end
50
55
  end.parse!
51
56
 
52
- if ARGV.empty?
57
+ if Array(arguments[:file_paths]).empty?
53
58
  STDERR.puts "At least 1 input file is required!\n"
54
59
  exit(1)
55
60
  else
56
- TitleGrabber.call(ARGF, arguments)
61
+ TitleGrabber.call(arguments)
57
62
  end
data/lib/title_grabber/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module TitleGrabber
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
data/lib/title_grabber.rb CHANGED
@@ -33,19 +33,19 @@ module TitleGrabber
33
33
  TWITTER_URL_PREFIX = -"https://#{TWITTER_HOST}"
34
34
  CSV_FIELD_SEP = -","
35
35
 
36
- def self.call(lines, options)
37
- MultiThreadedGrabber.new(lines, options).call
36
+ def self.call(options)
37
+ MultiThreadedGrabber.new(options).call
38
38
  end
39
39
 
40
40
  class MultiThreadedGrabber
41
41
  include HTTPHelper
42
42
  include TextHelper
43
43
 
44
- attr_reader :lines, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
44
+ attr_reader :file_paths, :out_path, :tmp_path, :connect_to, :read_to, :write_to,
45
45
  :max_redirects, :max_retries, :max_threads, :logger
46
46
 
47
- def initialize(lines, options)
48
- @lines = lines
47
+ def initialize(options)
48
+ @file_paths = options[:file_paths]
49
49
 
50
50
  @out_path = options.fetch(:output, DEF_OUT_PATH)
51
51
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
@@ -72,19 +72,20 @@ module TitleGrabber
72
72
  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
73
73
  csv << HEADERS
74
74
 
75
- lines.each do |line|
76
- md = line.match(URL_RE)
77
- next unless md
75
+ file_paths.each do |file_path|
76
+ file_path.each_line do |line|
77
+ md = line.match(URL_RE)
78
+ next unless md
78
79
 
79
- url = md.to_s
80
- if h = processed_urls[url]
81
- csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
82
- next
83
- end
80
+ url = md.to_s
81
+ if h = processed_urls[url]
82
+ csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
83
+ next
84
+ end
84
85
 
85
- queue << url
86
+ queue << url
87
+ end
86
88
  end
87
- lines = nil
88
89
 
89
90
  thr_cnt = [max_threads, queue.size].min
90
91
  threads = 1.upto(thr_cnt).map.with_index { |_, i|
@@ -116,7 +117,7 @@ module TitleGrabber
116
117
  tweet_urls.compact!
117
118
  tweet_urls.uniq!
118
119
  tweet_urls.map! do |url|
119
- if res = open_w_timeout(url, **http_opts)
120
+ if url.match?(URL_RE) && (res = open_w_timeout(url, **http_opts))
120
121
  uri = res.uri
121
122
  uri.host == TWITTER_HOST && !uri.to_s.match?(TWITTER_STATUS_RE) ? nil : uri.to_s
122
123
  else
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: title_grabber
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Cristian Rasch
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-04-17 00:00:00.000000000 Z
11
+ date: 2019-04-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: http