nhkore 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 445adf6e8abd4da9fd6dd25e9632d5f477b467f6ce8c3dcecae87e3f61305d98
4
- data.tar.gz: ca812639ff1edd8da835f5bbb2cde403c9cb63e17568fb3ec367eec00605ec17
3
+ metadata.gz: 2b9464ae2a62f0c9cc797f2f70028d2b7afd6f0677a431c54e5453690175ca29
4
+ data.tar.gz: 577987179a9001926629f1efd8e8d39fb61ad62543d24ea1caa4f1fe063fd1a4
5
5
  SHA512:
6
- metadata.gz: 392607205c53aa2a5dfcde244e5fa6137483d216dc27becf06c76798209d2dcf328f17abee2026d795207d4e783a23fd108e615525445f52ca6442560600cd42
7
- data.tar.gz: 7a1219623b6645bbc633ba9c94e767dcf86be8852a7228c1d5ddd3936f61b884897f680369d4c9d9db5aba8ab4561048d59aed15cecf7ba05695c1957f31b0ea
6
+ metadata.gz: '09d90011d4d641ea54c9dd7ebc8fd95efc8f7e68211e4322c1f3294e15a21303147de1eea2532694d5a01caaaf3c73f9a5172479193113be86b5b7a9fd08b910'
7
+ data.tar.gz: 65512547e6ee13503b345402e2eb1ba799e492131975f518cc96576a684bc97d48efb0f9c22787518f8dc62998e44fba1089ca2a1687b98e0533a489091e61c1
@@ -0,0 +1,3 @@
1
+ --files 'CHANGELOG.md,LICENSE.txt'
2
+ --protected
3
+ --readme 'README.md'
@@ -2,7 +2,23 @@
2
2
 
3
3
  Format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
4
4
 
5
- ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.6...master)
5
+ ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.7...HEAD)
6
+
7
+ ## [v0.3.7] - 2020-11-07
8
+
9
+ ### Changed
10
+ - Updated Gem `attr_bool` to v0.2
11
+ - Changed upper-case *'-V'* flag for *version* to be a lower-case *'-v'*
12
+ - Seems like a lot of apps/people expect this
13
+ - Refactored/Formatted some code
14
+ - *nhkore.gemspec* especially
15
+ - Added *samples/*, *Gemfile.lock*, and *.yardopts* to the files in *nhkore.gemspec*
16
+
17
+ ### Fixed
18
+ - ArticleScraper
19
+ - Fixed to accept text nodes that contain Kanji, because of an article with bad markup:
20
+ - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
21
+ - `第3のビール` should have HTML ruby tags around *第*
6
22
 
7
23
  ## [v0.3.6] - 2020-08-18
8
24
 
@@ -0,0 +1,86 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ nhkore (0.3.7)
5
+ attr_bool (~> 0.2)
6
+ bimyou_segmenter (~> 1.2)
7
+ cri (~> 2.15)
8
+ down (~> 5.1)
9
+ highline (~> 2.0)
10
+ http-cookie (~> 1.0)
11
+ japanese_deinflector (~> 0.0)
12
+ nokogiri (~> 1.10)
13
+ psychgus (~> 1.3)
14
+ public_suffix (~> 4.0)
15
+ rainbow (~> 3.0)
16
+ rubyzip (~> 2.3)
17
+ tiny_segmenter (~> 0.0)
18
+ tty-progressbar (~> 0.17)
19
+ tty-spinner (~> 0.9)
20
+
21
+ GEM
22
+ remote: https://rubygems.org/
23
+ specs:
24
+ addressable (2.7.0)
25
+ public_suffix (>= 2.0.2, < 5.0)
26
+ attr_bool (0.2.1)
27
+ bimyou_segmenter (1.2.0)
28
+ cri (2.15.10)
29
+ domain_name (0.5.20190701)
30
+ unf (>= 0.0.5, < 1.0.0)
31
+ down (5.2.0)
32
+ addressable (~> 2.5)
33
+ highline (2.0.3)
34
+ http-cookie (1.0.3)
35
+ domain_name (~> 0.5)
36
+ japanese_deinflector (0.0.2)
37
+ mini_portile2 (2.4.0)
38
+ minitest (5.14.2)
39
+ nokogiri (1.10.10)
40
+ mini_portile2 (~> 2.4.0)
41
+ psych (3.2.0)
42
+ psychgus (1.3.3)
43
+ psych (>= 3.0)
44
+ public_suffix (4.0.6)
45
+ rainbow (3.0.0)
46
+ rake (13.0.1)
47
+ raketeer (0.2.9)
48
+ rake
49
+ rdoc (6.2.1)
50
+ redcarpet (3.5.0)
51
+ rubyzip (2.3.0)
52
+ strings-ansi (0.1.0)
53
+ tiny_segmenter (0.0.6)
54
+ tty-cursor (0.7.1)
55
+ tty-progressbar (0.17.0)
56
+ strings-ansi (~> 0.1.0)
57
+ tty-cursor (~> 0.7)
58
+ tty-screen (~> 0.7)
59
+ unicode-display_width (~> 1.6)
60
+ tty-screen (0.8.1)
61
+ tty-spinner (0.9.3)
62
+ tty-cursor (~> 0.7)
63
+ unf (0.1.4)
64
+ unf_ext
65
+ unf_ext (0.0.7.7)
66
+ unicode-display_width (1.7.0)
67
+ yard (0.9.25)
68
+ yard_ghurt (1.2.0)
69
+ rake
70
+
71
+ PLATFORMS
72
+ ruby
73
+
74
+ DEPENDENCIES
75
+ bundler (~> 2.1)
76
+ minitest (~> 5.14)
77
+ nhkore!
78
+ rake (~> 13.0)
79
+ raketeer (~> 0.2)
80
+ rdoc (~> 6.2)
81
+ redcarpet (~> 3.5)
82
+ yard (~> 0.9)
83
+ yard_ghurt (~> 1.2)
84
+
85
+ BUNDLED WITH
86
+ 2.1.4
@@ -246,8 +246,7 @@ module NHKore
246
246
  app.scraper_kargs[:header] ||= {}
247
247
  app.scraper_kargs[:header]['user-agent'] = value
248
248
  end
249
- # Big V, not small.
250
- flag :V,:version,'show the version and exit' do |value,cmd|
249
+ flag :v,:version,'show the version and exit' do |value,cmd|
251
250
  app.show_version()
252
251
  exit
253
252
  end
@@ -43,6 +43,8 @@ module NHKore
43
43
  # @since 0.2.0
44
44
  ###
45
45
  class ArticleScraper < Scraper
46
+ extend AttrBool::Ext
47
+
46
48
  attr_reader :cleaners
47
49
  attr_accessor :datetime
48
50
  attr_accessor :dict
@@ -463,7 +465,10 @@ module NHKore
463
465
  return nil
464
466
  end
465
467
 
466
- text = word.kana # Should be kana only
468
+ # Can be Kanji only (instead of kana), as in this article:
469
+ # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
470
+ # - '第3のビール'
471
+ text = word.word # Should usually be kana only
467
472
 
468
473
  result.add_text(text) # No cleaning; raw text
469
474
 
@@ -472,8 +477,9 @@ module NHKore
472
477
  return nil if text.empty?() # No error; empty text is fine here
473
478
 
474
479
  word = Word.new(
475
- kana: text,
476
- word: word
480
+ kana: clean(word.kana),
481
+ kanji: clean(word.kanji),
482
+ word: word,
477
483
  )
478
484
 
479
485
  return word
@@ -34,6 +34,8 @@ module NHKore
34
34
  # @since 0.3.4
35
35
  ###
36
36
  class DatetimeParser
37
+ extend AttrBool::Ext
38
+
37
39
  # Order matters!
38
40
  FMTS = [
39
41
  '%Y-%m-%d %H:%M',
@@ -35,6 +35,8 @@ module NHKore
35
35
  # @since 0.2.0
36
36
  ###
37
37
  class Scraper
38
+ extend AttrBool::Ext
39
+
38
40
  DEFAULT_HEADER = {
39
41
  'user-agent' => UserAgents.sample(),
40
42
  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
@@ -34,6 +34,8 @@ module NHKore
34
34
  # @since 0.2.0
35
35
  ###
36
36
  class SearchLink
37
+ extend AttrBool::Ext
38
+
37
39
  attr_reader :datetime
38
40
  attr_reader :futsuurl
39
41
  attr_accessor? :scraped
@@ -45,6 +45,16 @@ module NHKore
45
45
  # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
46
46
  YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
47
47
 
48
+ IGNORE_LINK_REGEX = %r{
49
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
50
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
51
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
52
+ |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
53
+ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
54
+ # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
55
+ |/enqform\.html?
56
+ }x
57
+
48
58
  # Search Engines are strict, so trigger using the default HTTP header fields
49
59
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
50
60
  def initialize(url,eat_cookie: true,header: {},**kargs)
@@ -57,11 +67,8 @@ module NHKore
57
67
  link = Util.unspace_web_str(link).downcase() unless cleaned
58
68
 
59
69
  return true if link.empty?()
60
- return true if link =~ /\/about\.html?/ # https://www3.nhk.or.jp/news/easy/about.html
61
- return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
62
- return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
63
- return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
64
- return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
70
+
71
+ return true if IGNORE_LINK_REGEX.match?(link)
65
72
 
66
73
  return false
67
74
  end
@@ -22,5 +22,5 @@
22
22
 
23
23
 
24
24
  module NHKore
25
- VERSION = '0.3.6'
25
+ VERSION = '0.3.7'
26
26
  end
@@ -56,11 +56,15 @@ module NHKore
56
56
  if !unknown.nil?()
57
57
  # kanji?() only tests if it contains kanji, so don't use kana?().
58
58
  if Util.kanji?(unknown)
59
- raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]" unless Util.empty_web_str?(kanji)
59
+ if !Util.empty_web_str?(kanji)
60
+ raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
61
+ end
60
62
 
61
63
  kanji = unknown
62
64
  else
63
- raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]" unless Util.empty_web_str?(kana)
65
+ if !Util.empty_web_str?(kana)
66
+ raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
67
+ end
64
68
 
65
69
  kana = unknown
66
70
  end
@@ -25,7 +25,6 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
25
25
 
26
26
  require 'nhkore/version'
27
27
 
28
-
29
28
  Gem::Specification.new() do |spec|
30
29
  spec.name = 'nhkore'
31
30
  spec.version = NHKore::VERSION
@@ -39,31 +38,33 @@ Gem::Specification.new() do |spec|
39
38
  ' Includes a CLI app and a scraper library.'
40
39
 
41
40
  spec.metadata = {
42
- 'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
43
- 'changelog_uri' => 'https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md',
44
- 'homepage_uri' => 'https://github.com/esotericpig/nhkore',
45
- 'source_code_uri' => 'https://github.com/esotericpig/nhkore'
41
+ 'homepage_uri' => 'https://github.com/esotericpig/nhkore',
42
+ 'source_code_uri' => 'https://github.com/esotericpig/nhkore',
43
+ 'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
44
+ 'changelog_uri' => 'https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md',
45
+ #'documentation_uri' => '',
46
+ #'wiki_uri' => '',
47
+ #'mailing_list_uri' => '',
46
48
  }
47
49
 
48
- spec.require_paths = ['lib']
49
- spec.bindir = 'bin'
50
- spec.executables = [spec.name]
50
+ spec.requirements = [
51
+ 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html',
52
+ ]
53
+
54
+ spec.required_ruby_version = '>= 2.4'
55
+ spec.require_paths = ['lib']
56
+ spec.bindir = 'bin'
57
+ spec.executables = [spec.name]
51
58
 
52
59
  spec.files = [
53
60
  Dir.glob(File.join("{#{spec.require_paths.join(',')}}",'**','*.{erb,rb}')),
54
61
  Dir.glob(File.join(spec.bindir,'*')),
55
- Dir.glob(File.join('{test,yard}','**','*.{erb,rb}')),
56
- %W( Gemfile #{spec.name}.gemspec Rakefile ),
57
- %w( CHANGELOG.md LICENSE.txt README.md ),
62
+ Dir.glob(File.join('{samples,test,yard}','**','*.{erb,rb}')),
63
+ %W[ Gemfile Gemfile.lock #{spec.name}.gemspec Rakefile .yardopts ],
64
+ %w[ CHANGELOG.md LICENSE.txt README.md ],
58
65
  ].flatten()
59
66
 
60
- spec.required_ruby_version = '>= 2.4'
61
-
62
- spec.requirements = [
63
- 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html',
64
- ]
65
-
66
- spec.add_runtime_dependency 'attr_bool' ,'~> 0.1' # For attr_accessor?/attr_reader?
67
+ spec.add_runtime_dependency 'attr_bool' ,'~> 0.2' # For attr_accessor?/attr_reader?
67
68
  spec.add_runtime_dependency 'bimyou_segmenter' ,'~> 1.2' # For splitting Japanese sentences into words
68
69
  spec.add_runtime_dependency 'cri' ,'~> 2.15' # For CLI commands/options
69
70
  spec.add_runtime_dependency 'down' ,'~> 5.1' # For downloading files (GetCmd)
@@ -97,8 +98,17 @@ Gem::Specification.new() do |spec|
97
98
  Homepage: #{spec.homepage}
98
99
 
99
100
  Code: #{spec.metadata['source_code_uri']}
100
- Changelog: #{spec.metadata['changelog_uri']}
101
101
  Bugs: #{spec.metadata['bug_tracker_uri']}
102
102
 
103
+ Changelog: #{spec.metadata['changelog_uri']}
104
+
103
105
  EOM
106
+
107
+ spec.extra_rdoc_files = %w[ CHANGELOG.md LICENSE.txt README.md ]
108
+
109
+ spec.rdoc_options = [
110
+ '--hyperlink-all','--show-hash',
111
+ '--title',"NHKore v#{NHKore::VERSION} Doc",
112
+ '--main','README.md',
113
+ ]
104
114
  end
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ ###
25
+ # If you run this script, be aware that it uses the +-F+ force option
26
+ # (which overwrites files without prompting).
27
+ #
28
+ # @author Jonathan Bradley Whited (@esotericpig)
29
+ # @since 0.3.2
30
+ ###
31
+
32
+ case ARGV[0] # Dispatch on the first command-line argument
33
+ when '-c' # count: run the search with --show-count, print a hint, then exit
34
+ system('nhkore search ez --show-count')
35
+ puts
36
+ puts %q{Use the first number with the '-a' option.}
37
+ exit
38
+ when '-a' # articles: number of articles already scraped (see the usage text below)
39
+ articles = ARGV[1].to_i() # A missing or non-numeric argument becomes 0
40
+ articles = 0 if articles < 0 # Clamp negative counts to 0
41
+ else # Anything else (including no argument): print usage and exit
42
+ puts 'Options:'
43
+ puts ' -c show count to use with -a'
44
+ puts ' -a <int> number of articles already scraped; execute scraping'
45
+ exit
46
+ end
47
+
48
+ articles_inc = 25 # Value passed to nhkore's -s option on every run
49
+ max_errors = 5 # Total failed runs tolerated before the loop stops (for example, repeated 404 errors); never reset after a success
50
+ max_loop = 5 # Max loop iterations; possible total = articles_inc * max_loop
51
+
52
+ thread = Thread.new() do # Worker thread; the main thread installs the INT trap below and then waits on it
53
+ i = 0
54
+
55
+ while i < max_loop
56
+ puts "Loop #{i += 1} => #{articles} articles"
57
+
58
+ if system("nhkore -F -t 300 -m 10 news ez -s #{articles_inc}")
59
+ articles += articles_inc # Running total; only used in the progress message above
60
+ else # system() returned false or nil: the command exited non-zero or could not be executed
61
+ break if (max_errors -= 1) <= 0
62
+ end
63
+
64
+ puts
65
+ end
66
+ end
67
+
68
+ # Ctrl+C: stop the worker thread (gracefully if possible) and exit.
69
+ trap('INT') do
70
+ if thread.alive?()
71
+ # Try to exit gracefully.
72
+ max_loop = -1 # Makes the worker's `i < max_loop` check fail so the loop ends after the current run
73
+ thread.join(5) # Wait up to 5 seconds for the worker to finish
74
+
75
+ # Still alive after the wait: kill the worker.
76
+ thread.kill() if thread.alive?()
77
+ end
78
+
79
+ exit # End the script whether or not the worker was still alive
80
+ end
81
+
82
+ thread.join() # Block the main thread until the worker finishes
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nhkore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Bradley Whited (@esotericpig)
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-18 00:00:00.000000000 Z
11
+ date: 2020-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attr_bool
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0.1'
19
+ version: '0.2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0.1'
26
+ version: '0.2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bimyou_segmenter
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -339,10 +339,15 @@ email:
339
339
  executables:
340
340
  - nhkore
341
341
  extensions: []
342
- extra_rdoc_files: []
342
+ extra_rdoc_files:
343
+ - CHANGELOG.md
344
+ - LICENSE.txt
345
+ - README.md
343
346
  files:
347
+ - ".yardopts"
344
348
  - CHANGELOG.md
345
349
  - Gemfile
350
+ - Gemfile.lock
346
351
  - LICENSE.txt
347
352
  - README.md
348
353
  - Rakefile
@@ -379,6 +384,7 @@ files:
379
384
  - lib/nhkore/version.rb
380
385
  - lib/nhkore/word.rb
381
386
  - nhkore.gemspec
387
+ - samples/looper.rb
382
388
  - test/nhkore/test_helper.rb
383
389
  - test/nhkore_test.rb
384
390
  - yard/templates/default/layout/html/footer.erb
@@ -386,15 +392,22 @@ homepage: https://github.com/esotericpig/nhkore
386
392
  licenses:
387
393
  - LGPL-3.0-or-later
388
394
  metadata:
389
- bug_tracker_uri: https://github.com/esotericpig/nhkore/issues
390
- changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
391
395
  homepage_uri: https://github.com/esotericpig/nhkore
392
396
  source_code_uri: https://github.com/esotericpig/nhkore
393
- post_install_message: " \n NHKore v0.3.6\n \n You can now use [nhkore] on the
397
+ bug_tracker_uri: https://github.com/esotericpig/nhkore/issues
398
+ changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
399
+ post_install_message: " \n NHKore v0.3.7\n \n You can now use [nhkore] on the
394
400
  command line.\n \n Homepage: https://github.com/esotericpig/nhkore\n \n Code:
395
- \ https://github.com/esotericpig/nhkore\n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
396
- \ Bugs: https://github.com/esotericpig/nhkore/issues\n \n"
397
- rdoc_options: []
401
+ \ https://github.com/esotericpig/nhkore\n Bugs: https://github.com/esotericpig/nhkore/issues\n
402
+ \ \n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
403
+ \ \n"
404
+ rdoc_options:
405
+ - "--hyperlink-all"
406
+ - "--show-hash"
407
+ - "--title"
408
+ - NHKore v0.3.7 Doc
409
+ - "--main"
410
+ - README.md
398
411
  require_paths:
399
412
  - lib
400
413
  required_ruby_version: !ruby/object:Gem::Requirement