title_grabber 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 96fbc07b812b570e0827c2d6b7b2d18f7b9ad0b4a172b69606648bb21d7e41b6
4
+ data.tar.gz: 72c04ad3f1149e44bc0a16a8e7d6c0c5d13be794e026c3748415f96197c5d146
5
+ SHA512:
6
+ metadata.gz: c88bdf88fe2dd0bce4bde4ad97d5432e7d5433a4a581372c938500fa7323d0020437c5326e19e3f7248cebe02123c93dfd00cb498669ae363a33d2267f28f90f
7
+ data.tar.gz: f9901cb587973ed762a2ad100e9410819a36dfe34e61c54daac78ad6ff72284faeddcd1f8f27594c25467622e3882670473931367d43efa68a026e7151f2bba8
data/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+ bin/console
10
+ bin/setup
11
+ *.csv
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.6.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
source "https://rubygems.org"

# Lets `gem "x", github: "owner/repo"` entries resolve over HTTPS.
git_source(:github) do |repo_name|
  "https://github.com/#{repo_name}"
end

# All dependencies of this gem are declared in title_grabber.gemspec.
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,49 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ title_grabber (0.2.0)
5
+ http (~> 4.1)
6
+ oga (~> 2.15)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ addressable (2.6.0)
12
+ public_suffix (>= 2.0.2, < 4.0)
13
+ ansi (1.5.0)
14
+ ast (2.4.0)
15
+ domain_name (0.5.20180417)
16
+ unf (>= 0.0.5, < 1.0.0)
17
+ http (4.1.1)
18
+ addressable (~> 2.3)
19
+ http-cookie (~> 1.0)
20
+ http-form_data (~> 2.0)
21
+ http_parser.rb (~> 0.6.0)
22
+ http-cookie (1.0.3)
23
+ domain_name (~> 0.5)
24
+ http-form_data (2.1.1)
25
+ http_parser.rb (0.6.0)
26
+ minitest (5.11.3)
27
+ oga (2.15)
28
+ ast
29
+ ruby-ll (~> 2.1)
30
+ public_suffix (3.0.3)
31
+ rake (10.5.0)
32
+ ruby-ll (2.1.2)
33
+ ansi
34
+ ast
35
+ unf (0.1.4)
36
+ unf_ext
37
+ unf_ext (0.0.7.5)
38
+
39
+ PLATFORMS
40
+ ruby
41
+
42
+ DEPENDENCIES
43
+ bundler (~> 1.17)
44
+ minitest (~> 5.0)
45
+ rake (~> 10.0)
46
+ title_grabber!
47
+
48
+ BUNDLED WITH
49
+ 1.17.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2019 Cristian Rasch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # TitleGrabber
2
+
3
+ Grab page & article titles from lists of URLs contained in files passed in as arguments
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'title_grabber'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install title_grabber
20
+
21
+ ## Usage
22
+
23
+ Just pass it a list of files containing URLs (one per line)
24
+
25
+ ```
26
+ title-grabber /abs/path/2/file1.txt rel/path/2/file2.txt
27
+ ```
28
+
29
+ Results are written to `out.csv` in the current working directory, or to the
30
+ file given with the `-o`/`--output` option, e.g.
31
+
32
+ ```
33
+ title-grabber -o ~/output.csv /abs/path/2/file1.txt rel/path/2/file2.txt
34
+ ```
35
+
36
+ ## Development
37
+
38
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
39
+
40
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
41
+
42
+ ## Contributing
43
+
44
+ Bug reports and pull requests are welcome on GitHub at https://github.com/cristian-rasch/title_grabber.
45
+
46
+ ## License
47
+
48
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"
require "rake/testtask"

# `rake test` runs every *_test.rb file found under test/, with both test/
# and lib/ added to the load path.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test", "lib")
  test_task.test_files = FileList["test/**/*_test.rb"]
end

# Running bare `rake` runs the test suite.
task default: :test
data/exe/title-grabber ADDED
@@ -0,0 +1,25 @@
1
#!/usr/bin/env ruby

require "optparse"
require "pathname"

require_relative '../lib/title_grabber'

# Command-line entry point.
#
# Parses the -o/--output option, then hands every remaining argument (the
# input files, read through ARGF) to TitleGrabber.call.

script_name = Pathname(__FILE__).basename
def_out_path = Pathname('out.csv')

arguments = { output: def_out_path }

parser = OptionParser.new do |args|
  args.banner = "Usage: #{script_name} [options] FILE..."

  args.on("-o", "--output FILE", "Output file (defaults to #{def_out_path.basename})") do |out|
    arguments[:output] = Pathname(out)
  end
end

begin
  # parse! strips the recognised options from ARGV, leaving only input files.
  parser.parse!
rescue OptionParser::ParseError => err
  # Show the problem and the usage text instead of a raw backtrace.
  abort "#{err.message}\n#{parser}"
end

# abort writes the message to STDERR and exits with status 1.
abort "At least 1 input file is required!" if ARGV.empty?

TitleGrabber.call(ARGF, arguments)
@@ -0,0 +1,66 @@
1
+ require "logger"
2
+
3
+ require "http"
4
+
5
+ require_relative "text_helper"
6
+
7
# Mixin that downloads a URL with timeouts, redirect following and a bounded
# number of retries, returning the response body re-encoded as UTF-8.
module HTTPHelper
  # Per-operation timeouts in seconds; each can be overridden through the
  # environment variable named in the ENV.fetch call.
  WRITE_TO = Integer(ENV.fetch("WRITE_TIMEOUT", 5))
  CONN_TO = Integer(ENV.fetch("CONNECT_TIMEOUT", 10))
  READ_TO = Integer(ENV.fetch("READ_TIMEOUT", 15))
  # Maximum number of redirects followed for a single request.
  MAX_HOPS = 5
  # Maximum number of retries after the first failed attempt.
  MAX_RETRIES = 3
  INVALID_BYTE_SEQ = "invalid byte sequence".freeze
  # Message prefixes of low-level connection failures that are worth retrying
  # even though they are neither HTTP::Error nor Timeout::Error.
  CONNECTION_ERRORS = ["SSL_connect", "Connection reset"].freeze

  include TextHelper

  # GETs +url+ and returns its body as a UTF-8 string.
  #
  # @param url [String] address to fetch
  # @param write_to [Integer] write timeout in seconds
  # @param connect_to [Integer] connect timeout in seconds
  # @param read_to [Integer] read timeout in seconds; also used as the overall
  #   deadline for one attempt (including redirects) via Timeout.timeout
  # @return [String, nil] the UTF-8 encoded body, or nil when the request
  #   failed (the failure is logged, never raised)
  def open_w_timeout(url, write_to: WRITE_TO, connect_to: CONN_TO,
                     read_to: READ_TO)
    logger.info "GET #{url}"
    retries = 0

    # An explicit begin block is required: `retry` must restart only the
    # request, not the logging and counter initialisation above.
    begin
      body = Timeout.timeout(read_to) do
        HTTP.timeout(write: write_to, connect: connect_to, read: read_to)
            .follow(max_hops: MAX_HOPS)
            .get(url, ssl_context: ssl_ctx)
            .to_s
      end
    rescue StandardError => err
      msg = err.message

      if retryable_error?(err)
        retries += 1

        if retries <= MAX_RETRIES
          logger.warn "URL: #{url} [#{msg}] - Retry ##{retries}"
          retry
        end
      end

      logger.error "URL: #{url} [#{msg}]"
      nil
    else
      utf8_encode(body)
    end
  end

  private

  # True when +err+ is a transient failure: an HTTP or timeout error, or any
  # other error whose message starts with one of CONNECTION_ERRORS.
  def retryable_error?(err)
    case err
    when HTTP::Error, Timeout::Error
      true
    else
      msg = err.message
      CONNECTION_ERRORS.any? { |prefix| msg.start_with?(prefix) }
    end
  end

  # Lazily created logger writing to STDOUT.
  def logger
    @logger ||= Logger.new(STDOUT)
  end

  # Lazily created SSL context shared by all requests.
  #
  # NOTE(review): VERIFY_NONE disables certificate verification, so HTTPS
  # responses are not authenticated. Kept as-is to preserve behaviour for
  # hosts with broken certificates; confirm this trade-off is intended.
  def ssl_ctx
    @ssl_ctx ||= OpenSSL::SSL::SSLContext.new.tap do |ctx|
      ctx.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
  end
end
@@ -0,0 +1,18 @@
1
# Small string utilities shared by the HTTP and title-extraction code.
module TextHelper
  # Converts +text+ to a UTF-8 string, dropping invalid and unconvertible
  # characters.
  #
  # @param text [#to_s, nil] value to convert; nil becomes an empty string
  # @return [String] the UTF-8 encoded text, or an empty (frozen) string when
  #   the conversion raises an EncodingError
  def utf8_encode(text = nil)
    String(text).encode(-"UTF-8", invalid: :replace, undef: :replace,
                                  replace: -"")
  rescue EncodingError
    -""
  end

  # Normalises whitespace in +text+ IN PLACE: trims both ends, turns newlines
  # into spaces and collapses every run of two or more whitespace characters
  # into a single space. Ruby equivalent of:
  # document.querySelector('title').textContent.trim().replace(/\n/g, ' ').replace(/\s{2,}/g, ' ')
  #
  # @param text [String] mutable string to clean up
  # @return [String] the same (mutated) string object
  def clean_up_whitespace(text)
    text.strip!
    text.gsub!("\n", " ")
    # Must be the mutating gsub!: callers rely on the in-place change.
    text.gsub!(/\s{2,}/, ' ')
    # The bang methods return nil when nothing changed, so return text itself.
    text
  end
end
@@ -0,0 +1,3 @@
1
module TitleGrabber
  # Gem version, read by title_grabber.gemspec. Frozen so that the constant
  # cannot be mutated at runtime.
  VERSION = "0.2.0".freeze
end
@@ -0,0 +1,95 @@
1
+ require "csv"
2
+ require "etc"
3
+ require "fileutils"
4
+
5
+ require "bundler/setup"
6
+ require "oga"
7
+
8
+ require_relative "title_grabber/version"
9
+ require_relative "http_helper"
10
+ require_relative "text_helper"
11
+
12
# Extracts the page title (<title>) and article title (first <h1>, preferring
# one inside <article>) of every URL found in the input and records them in a
# CSV file.
module TitleGrabber
  class Error < StandardError; end

  # First http(s) URL on a line; everything up to the next whitespace.
  URL_RE = %r(https?://\S+)i
  # Column names of the output CSV.
  URL_HEADER = -"url"
  PAGE_TIT_HEAD = -"page_title"
  ART_TIT_HEAD = -"article_title"
  HEADERS = [URL_HEADER, PAGE_TIT_HEAD, ART_TIT_HEAD].freeze

  class << self
    include HTTPHelper
    include TextHelper

    # Reads URLs from +lines+, fetches the ones not already present in the
    # output file and (re)writes the output CSV.
    #
    # When the output file already exists it is first copied to a sibling
    # "<name>.tmp<ext>" file; rows from it that have at least one title are
    # reused instead of being fetched again. The copy is removed only after a
    # successful run, so it survives as a backup if the run aborts.
    #
    # @param lines [#each] yields one String per input line (e.g. ARGF)
    # @param options [Hash] must contain :output, a Pathname of the CSV file
    # @return [Integer, nil] result of unlinking the temporary copy, or nil
    #   when no previous output existed
    def call(lines, options)
      out_path = options[:output]
      tmp_path = nil
      processed_urls = {}

      if out_path.exist?
        tmp_path = out_path.sub_ext(".tmp#{out_path.extname}")
        FileUtils.cp(out_path, tmp_path)
        processed_urls = read_processed_urls(tmp_path)
      end

      queue = Queue.new
      # Serialises appends to the CSV, which is shared by all worker threads.
      csv_lock = Mutex.new

      CSV.open(out_path, "w", force_quotes: true) do |csv|
        csv << HEADERS

        lines.each do |line|
          url = line[URL_RE]
          next unless url

          if (titles = processed_urls[url])
            csv << [url, titles[PAGE_TIT_HEAD], titles[ART_TIT_HEAD]]
          else
            queue << url
          end
        end

        # Never start more workers than there are URLs to fetch.
        thr_cnt = [Etc.nprocessors * 2, queue.size].min
        workers = Array.new(thr_cnt) do
          Thread.new do
            while (url = next_url(queue))
              row = fetch_row(url)
              csv_lock.synchronize { csv << row } if row
            end
          end
        end
        workers.each(&:join)
      end

      tmp_path&.unlink
    end

    private

    # Loads previously recorded titles from +csv_path+.
    #
    # Rows without a URL, or with both titles blank, are skipped so that those
    # URLs get fetched again. Missing cells (nil) are treated as empty.
    #
    # @return [Hash{String => Hash}] URL => { page_title, article_title }
    def read_processed_urls(csv_path)
      CSV.read(csv_path, headers: true).each_with_object({}) do |row, seen|
        url = row[URL_HEADER]
        page_tit = row[PAGE_TIT_HEAD].to_s
        art_tit = row[ART_TIT_HEAD].to_s
        next if url.nil? || (page_tit.empty? && art_tit.empty?)

        seen[url] = { PAGE_TIT_HEAD => page_tit, ART_TIT_HEAD => art_tit }
      end
    end

    # Non-blocking pop: returns the next queued URL, or nil once the queue is
    # empty (Queue#pop(true) raises ThreadError in that case).
    def next_url(queue)
      queue.pop(true)
    rescue ThreadError
      nil
    end

    # Downloads +url+ and builds its CSV row.
    #
    # @return [Array<String>, nil] [url, page_title, article_title], or nil
    #   when the page could not be fetched or parsed (the error is logged so a
    #   single bad page does not abort the whole run)
    def fetch_row(url)
      html = open_w_timeout(url)
      return unless html

      doc = Oga.parse_html(html)
      page_title = title_text(doc.at_css('title'))
      article_title = title_text(doc.at_css('article h1') || doc.at_css('h1'))

      [url, page_title, article_title]
    rescue StandardError => err
      logger.error "URL: #{url} [#{err.message}]"
      nil
    end

    # Whitespace-normalised text of +node+, or an empty string when the node
    # is missing or has no text.
    def title_text(node)
      text = node&.text
      return -"" if text.nil? || text.empty?

      # Unary plus yields a mutable string; clean_up_whitespace edits in place.
      clean_up_whitespace(+text)
    end
  end
end
@@ -0,0 +1,45 @@
1
+
2
# Make lib/ loadable so the version constant can be read below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "title_grabber/version"

Gem::Specification.new do |spec|
  spec.name          = "title_grabber"
  spec.version       = TitleGrabber::VERSION
  spec.authors       = ["Cristian Rasch"]
  spec.email         = ["cristianrasch@fastmail.fm"]

  spec.summary       = "Grabs page & article titles from lists of URLs contained in files passed in as arguments"
  spec.homepage      = "https://bitbucket.org/cristian-rasch/title_grabber"
  spec.license       = "MIT"

  # Gem metadata needs RubyGems 2.0 or newer; refuse to build on anything older.
  unless spec.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://bitbucket.org/cristian-rasch/title_grabber/src/master/"

  # Package every git-tracked file except tests, specs and features.
  # `git ls-files -z` separates the paths with NUL bytes.
  gem_root = File.expand_path('..', __FILE__)
  excluded = %r{^(test|spec|features)/}
  spec.files = Dir.chdir(gem_root) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| excluded.match?(path) }
  end

  # Everything under exe/ is installed as an executable.
  spec.bindir        = "exe"
  exe_paths = spec.files.select { |path| %r{^exe/}.match?(path) }
  spec.executables   = exe_paths.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_runtime_dependency "http", "~> 4.1"
  spec.add_runtime_dependency "oga", "~> 2.15"

  spec.add_development_dependency "bundler", "~> 1.17"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "minitest", "~> 5.0"
end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: title_grabber
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.0
5
+ platform: ruby
6
+ authors:
7
+ - Cristian Rasch
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2019-04-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: http
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: oga
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.15'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.15'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.17'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.17'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '10.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '10.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.0'
83
+ description:
84
+ email:
85
+ - cristianrasch@fastmail.fm
86
+ executables:
87
+ - title-grabber
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - ".gitignore"
92
+ - ".ruby-version"
93
+ - Gemfile
94
+ - Gemfile.lock
95
+ - LICENSE.txt
96
+ - README.md
97
+ - Rakefile
98
+ - exe/title-grabber
99
+ - lib/http_helper.rb
100
+ - lib/text_helper.rb
101
+ - lib/title_grabber.rb
102
+ - lib/title_grabber/version.rb
103
+ - title_grabber.gemspec
104
+ homepage: https://bitbucket.org/cristian-rasch/title_grabber
105
+ licenses:
106
+ - MIT
107
+ metadata:
108
+ homepage_uri: https://bitbucket.org/cristian-rasch/title_grabber
109
+ source_code_uri: https://bitbucket.org/cristian-rasch/title_grabber/src/master/
110
+ post_install_message:
111
+ rdoc_options: []
112
+ require_paths:
113
+ - lib
114
+ required_ruby_version: !ruby/object:Gem::Requirement
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ version: '0'
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: '0'
124
+ requirements: []
125
+ rubygems_version: 3.0.3
126
+ signing_key:
127
+ specification_version: 4
128
+ summary: Grabs page & article titles from lists of URLs contained in files passed
129
+ in as arguments
130
+ test_files: []