title_grabber 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 96fbc07b812b570e0827c2d6b7b2d18f7b9ad0b4a172b69606648bb21d7e41b6
4
+ data.tar.gz: 72c04ad3f1149e44bc0a16a8e7d6c0c5d13be794e026c3748415f96197c5d146
5
+ SHA512:
6
+ metadata.gz: c88bdf88fe2dd0bce4bde4ad97d5432e7d5433a4a581372c938500fa7323d0020437c5326e19e3f7248cebe02123c93dfd00cb498669ae363a33d2267f28f90f
7
+ data.tar.gz: f9901cb587973ed762a2ad100e9410819a36dfe34e61c54daac78ad6ff72284faeddcd1f8f27594c25467622e3882670473931367d43efa68a026e7151f2bba8
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ bin/console
10
+ bin/setup
11
+ *.csv
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.6.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in title_grabber.gemspec
6
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,49 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ title_grabber (0.2.0)
5
+ http (~> 4.1)
6
+ oga (~> 2.15)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ addressable (2.6.0)
12
+ public_suffix (>= 2.0.2, < 4.0)
13
+ ansi (1.5.0)
14
+ ast (2.4.0)
15
+ domain_name (0.5.20180417)
16
+ unf (>= 0.0.5, < 1.0.0)
17
+ http (4.1.1)
18
+ addressable (~> 2.3)
19
+ http-cookie (~> 1.0)
20
+ http-form_data (~> 2.0)
21
+ http_parser.rb (~> 0.6.0)
22
+ http-cookie (1.0.3)
23
+ domain_name (~> 0.5)
24
+ http-form_data (2.1.1)
25
+ http_parser.rb (0.6.0)
26
+ minitest (5.11.3)
27
+ oga (2.15)
28
+ ast
29
+ ruby-ll (~> 2.1)
30
+ public_suffix (3.0.3)
31
+ rake (10.5.0)
32
+ ruby-ll (2.1.2)
33
+ ansi
34
+ ast
35
+ unf (0.1.4)
36
+ unf_ext
37
+ unf_ext (0.0.7.5)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ bundler (~> 1.17)
44
+ minitest (~> 5.0)
45
+ rake (~> 10.0)
46
+ title_grabber!
47
+
48
+ BUNDLED WITH
49
+ 1.17.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Cristian Rasch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # TitleGrabber
2
+
3
+ Grab page & article titles from lists of URLs contained in files passed in as arguments
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'title_grabber'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install title_grabber
20
+
21
+ ## Usage
22
+
23
+ Just pass it a list of files containing URLs (one per line)
24
+
25
+ ```
26
+ title-grabber /abs/path/2/file1.txt rel/path/2/file2.txt
27
+ ```
28
+
29
+ Data is either recorded to out.csv in the CWD or the file specified using the
30
+ -o/--output argument, e.g.
31
+
32
+ ```
33
+ title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
+ ```
35
+
36
+ ## Development
37
+
38
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
39
+
40
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
41
+
42
+ ## Contributing
43
+
44
+ Bug reports and pull requests are welcome on Bitbucket at https://bitbucket.org/cristian-rasch/title_grabber.
45
+
46
+ ## License
47
+
48
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new(:test) do |t|
5
+ t.libs << "test"
6
+ t.libs << "lib"
7
+ t.test_files = FileList["test/**/*_test.rb"]
8
+ end
9
+
10
+ task :default => :test
data/exe/title-grabber ADDED
@@ -0,0 +1,25 @@
1
#!/usr/bin/env ruby

# Command-line entry point for TitleGrabber.
#
# Usage: title-grabber [-o OUTPUT.csv] FILE...
#
# Every FILE is read line by line (through ARGF) and handed to
# TitleGrabber.call together with the parsed options.

require "optparse"
require "pathname"

require_relative '../lib/title_grabber'

script_path = Pathname(__FILE__)
def_out_path = Pathname('out.csv')

arguments = { output: def_out_path }
parser = OptionParser.new do |args|
  args.banner = "Usage: #{script_path.basename} [options] FILE..."

  args.on("-o", "--output FILE", "Output file (defaults to #{def_out_path.basename})") do |out|
    arguments[:output] = Pathname(out)
  end
end

begin
  # parse! strips the recognised options from ARGV, leaving only input files.
  parser.parse!
rescue OptionParser::ParseError => err
  # Report bad options as a usage error instead of an uncaught backtrace.
  abort "#{err.message}\n#{parser}"
end

# Kernel#abort prints to STDERR and exits with status 1.
abort "At least 1 input file is required!" if ARGV.empty?

TitleGrabber.call(ARGF, arguments)
@@ -0,0 +1,66 @@
1
+ require "logger"
2
+
3
+ require "http"
4
+
5
+ require_relative "text_helper"
6
+
7
# Mixin offering a timeout-guarded HTTP GET with bounded retries whose
# response body is returned as UTF-8 text.
module HTTPHelper
  # Per-phase timeouts in seconds, overridable through the environment.
  WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 5))
  CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 10))
  READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
  # Maximum number of redirects followed per request.
  MAX_HOPS = 5
  # Maximum number of retries after the initial attempt.
  MAX_RETRIES = 3
  # Not referenced inside this module; kept so the constant stays available.
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
  # Message prefixes of low-level errors that are worth retrying.
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze

  include TextHelper

  # Fetches +url+ with a GET request and returns the body encoded as UTF-8.
  #
  # HTTP::Error, Timeout::Error and any other error whose message starts
  # with one of CONNECTION_ERRORS are retried up to MAX_RETRIES times; every
  # other StandardError is logged and gives up immediately.
  #
  # @param url [String] address to fetch
  # @param write_to [Integer] write timeout in seconds
  # @param connect_to [Integer] connect timeout in seconds
  # @param read_to [Integer] read timeout in seconds; also used as the
  #   overall deadline for a single attempt
  # @return [String, nil] the UTF-8 body, or nil when the request failed
  def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
                     read_to: READ_TO)
    logger.info "GET #{url}"
    retries = 0

    begin
      body = Timeout.timeout(read_to) {
        HTTP.timeout(write: write_to, connect: connect_to, read: read_to).
             follow(max_hops: MAX_HOPS).
             get(url, ssl_context: ssl_ctx).
             to_s
      }
    rescue => err
      msg = err.message

      if retryable_error?(err)
        retries += 1

        if retries <= MAX_RETRIES
          logger.warn "URL: #{url} [#{msg}] - Retry ##{retries}"
          retry
        end
      end

      logger.error "URL: #{url} [#{msg}]"
      nil
    else
      utf8_encode(body)
    end
  end

  private

  # True for errors treated as transient: the HTTP client's own errors,
  # timeouts, and errors whose message starts with a CONNECTION_ERRORS prefix.
  def retryable_error?(err)
    case err
    when HTTP::Error, Timeout::Error then true
    else err.message.start_with?(*CONNECTION_ERRORS)
    end
  end

  # Lazily created logger writing to standard output.
  def logger
    @logger ||= Logger.new(STDOUT)
  end

  # Memoized SSL context passed to every request.
  #
  # NOTE(review): VERIFY_NONE disables certificate verification, so HTTPS
  # responses are not authenticated. Left unchanged because enabling
  # verification would make hosts with broken certificates fail; confirm
  # whether this trade-off is still wanted.
  def ssl_ctx
    @ssl_ctx ||= begin
      ctx = OpenSSL::SSL::SSLContext.new
      ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
      ctx
    end
  end
end
@@ -0,0 +1,18 @@
1
# Text normalisation helpers shared by the fetching and title-extraction code.
module TextHelper
  # Coerces +text+ to a String and transcodes it to UTF-8, dropping any byte
  # sequence that is invalid or has no UTF-8 equivalent.
  #
  # @param text [#to_s, nil] value to convert; nil becomes an empty string
  # @return [String] the UTF-8 text, or "" when transcoding raises
  def utf8_encode(text = nil)
    String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
                                  replace: -"")
  rescue EncodingError
    -""
  end

  # Normalises whitespace IN PLACE, mirroring this JavaScript snippet:
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
  #
  # Callers rely on +text+ itself being mutated, so only bang methods are
  # used and +text+ must not be frozen.
  #
  # @param text [String] mutable string to clean up
  # @return [String] the same +text+ object, after clean-up
  def clean_up_whitespace(text)
    text.strip!
    text.gsub!("\n", " ")
    # Must be gsub!: the non-bang form returns a copy that would be discarded,
    # leaving runs of whitespace in +text+ untouched.
    text.gsub!(/\s{2,}/, ' ')
    text
  end
end
@@ -0,0 +1,3 @@
1
module TitleGrabber
  # Gem version string; frozen so the constant cannot be mutated at runtime.
  VERSION = "0.2.0".freeze
end
@@ -0,0 +1,95 @@
1
+ require "csv"
2
+ require "etc"
3
+ require "fileutils"
4
+
5
+ require "bundler/setup"
6
+ require "oga"
7
+
8
+ require_relative "title_grabber/version"
9
+ require_relative "http_helper"
10
+ require_relative "text_helper"
11
+
12
# Extracts page and article titles for every URL found in the given input
# lines and records them in a CSV file.
module TitleGrabber
  class Error < StandardError; end

  # First http(s) URL on a line, up to the next whitespace character.
  URL_RE = %r(https?://\S+)i
  URL_HEADER = -"url"
  PAGE_TIT_HEAD = -"page_title"
  ART_TIT_HEAD = -"article_title"
  # Column order of the output CSV.
  HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze

  class << self
    include HTTPHelper
    include TextHelper

    # Writes one CSV row (url, page title, article title) per URL in +lines+.
    #
    # When the output file already exists it is first copied to a sibling
    # ".tmp<ext>" file and the rows with at least one title are reused, so
    # those URLs are not fetched again. Remaining URLs are fetched by a pool
    # of threads; URLs whose download fails get no row. The backup copy is
    # removed only after the run completes, so it survives a crash.
    #
    # @param lines [#each] yields Strings, each possibly containing a URL
    # @param options [Hash] :output must respond to exist?/sub_ext/extname
    #   (the executable passes a Pathname)
    # @return [Object, nil] result of unlinking the backup, or nil when no
    #   backup was made
    def call(lines, options)
      out_path = options[:output]
      tmp_path = backup_path_for(out_path)
      processed_urls = tmp_path ? load_processed_urls(out_path, tmp_path) : {}

      queue = Queue.new
      # CSV writers are shared by all workers; serialise appends explicitly.
      write_lock = Mutex.new

      CSV.open(out_path, "w", force_quotes: true) do |csv|
        csv << HEADERS

        lines.each do |line|
          url = line[URL_RE]
          next unless url

          if (known = processed_urls[url])
            csv << [url, known[PAGE_TIT_HEAD], known[ART_TIT_HEAD]]
          else
            queue << url
          end
        end
        # A closed queue makes Queue#pop return nil once drained, which ends
        # each worker loop without polling or rescuing ThreadError.
        queue.close

        thr_cnt = [Etc.nprocessors * 2, queue.size].min
        Array.new(thr_cnt) {
          Thread.new do
            while (url = queue.pop)
              html = open_w_timeout(url)
              next unless html

              row = [url, *extract_titles(html)]
              write_lock.synchronize { csv << row }
            end
          end
        }.each(&:join)
      end

      tmp_path&.unlink
    end

    private

    # Path of the backup copy for +out_path+, or nil when there is nothing
    # to back up because the output file does not exist yet.
    def backup_path_for(out_path)
      out_path.sub_ext(".tmp#{out_path.extname}") if out_path.exist?
    end

    # Copies +out_path+ to +tmp_path+ and returns a Hash mapping each URL
    # that already has a page or article title to its recorded titles.
    def load_processed_urls(out_path, tmp_path)
      FileUtils.cp(out_path, tmp_path)

      CSV.read(tmp_path, headers: true).each_with_object({}) do |row, seen|
        # Unquoted empty or missing fields are read back as nil.
        page_tit = row[PAGE_TIT_HEAD].to_s
        art_tit = row[ART_TIT_HEAD].to_s
        next if page_tit.empty? && art_tit.empty?

        seen[row[URL_HEADER]] = { PAGE_TIT_HEAD => page_tit,
                                  ART_TIT_HEAD => art_tit }
      end
    end

    # Parses +html+ and returns [page_title, article_title]. The article
    # title is the first "article h1", falling back to the first "h1";
    # a missing element yields an empty string.
    def extract_titles(html)
      doc = Oga.parse_html(html)
      page_title = doc.at_css('title')&.text || -""
      article_title = doc.at_css('article h1')&.text ||
                      doc.at_css('h1')&.text || -""

      [page_title, article_title].each do |title|
        clean_up_whitespace(title) unless title.empty?
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "title_grabber/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "title_grabber"
8
+ spec.version = TitleGrabber::VERSION
9
+ spec.authors = ["Cristian Rasch"]
10
+ spec.email = ["cristianrasch@fastmail.fm"]
11
+
12
+ spec.summary = %q{Grabs page & article titles from lists of URLs contained in files passed in as arguments}
13
+ # spec.description = %q{TODO: Write a longer description or delete this line.}
14
+ spec.homepage = "https://bitbucket.org/cristian-rasch/title_grabber"
15
+ spec.license = "MIT"
16
+
17
+ # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
18
+ # to allow pushing to a single host or delete this section to allow pushing to any host.
19
+ if spec.respond_to?(:metadata)
20
+ # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
21
+
22
+ spec.metadata["homepage_uri"] = spec.homepage
23
+ spec.metadata["source_code_uri"] = "https://bitbucket.org/cristian-rasch/title_grabber/src/master/"
24
+ # spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
25
+ else
26
+ raise "RubyGems 2.0 or newer is required to protect against " \
27
+ "public gem pushes."
28
+ end
29
+
30
+ # Specify which files should be added to the gem when it is released.
31
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
32
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
33
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
34
+ end
35
+ spec.bindir = "exe"
36
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
37
+ spec.require_paths = ["lib"]
38
+
39
+ spec.add_runtime_dependency "http", "~> 4.1"
40
+ spec.add_runtime_dependency "oga", "~> 2.15"
41
+
42
+ spec.add_development_dependency "bundler", "~> 1.17"
43
+ spec.add_development_dependency "rake", "~> 10.0"
44
+ spec.add_development_dependency "minitest", "~> 5.0"
45
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: title_grabber
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Cristian Rasch
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-04-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: http
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: oga
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.15'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.15'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.17'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.17'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.0'
83
+ description:
84
+ email:
85
+ - cristianrasch@fastmail.fm
86
+ executables:
87
+ - title-grabber
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".ruby-version"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE.txt
96
+ - README.md
97
+ - Rakefile
98
+ - exe/title-grabber
99
+ - lib/http_helper.rb
100
+ - lib/text_helper.rb
101
+ - lib/title_grabber.rb
102
+ - lib/title_grabber/version.rb
103
+ - title_grabber.gemspec
104
+ homepage: https://bitbucket.org/cristian-rasch/title_grabber
105
+ licenses:
106
+ - MIT
107
+ metadata:
108
+ homepage_uri: https://bitbucket.org/cristian-rasch/title_grabber
109
+ source_code_uri: https://bitbucket.org/cristian-rasch/title_grabber/src/master/
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubygems_version: 3.0.3
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: Grabs page & article titles from lists of URLs contained in files passed
129
+ in as arguments
130
+ test_files: []