nhkore 0.3.6 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 445adf6e8abd4da9fd6dd25e9632d5f477b467f6ce8c3dcecae87e3f61305d98
4
- data.tar.gz: ca812639ff1edd8da835f5bbb2cde403c9cb63e17568fb3ec367eec00605ec17
3
+ metadata.gz: 2b9464ae2a62f0c9cc797f2f70028d2b7afd6f0677a431c54e5453690175ca29
4
+ data.tar.gz: 577987179a9001926629f1efd8e8d39fb61ad62543d24ea1caa4f1fe063fd1a4
5
5
  SHA512:
6
- metadata.gz: 392607205c53aa2a5dfcde244e5fa6137483d216dc27becf06c76798209d2dcf328f17abee2026d795207d4e783a23fd108e615525445f52ca6442560600cd42
7
- data.tar.gz: 7a1219623b6645bbc633ba9c94e767dcf86be8852a7228c1d5ddd3936f61b884897f680369d4c9d9db5aba8ab4561048d59aed15cecf7ba05695c1957f31b0ea
6
+ metadata.gz: '09d90011d4d641ea54c9dd7ebc8fd95efc8f7e68211e4322c1f3294e15a21303147de1eea2532694d5a01caaaf3c73f9a5172479193113be86b5b7a9fd08b910'
7
+ data.tar.gz: 65512547e6ee13503b345402e2eb1ba799e492131975f518cc96576a684bc97d48efb0f9c22787518f8dc62998e44fba1089ca2a1687b98e0533a489091e61c1
@@ -0,0 +1,3 @@
1
+ --files 'CHANGELOG.md,LICENSE.txt'
2
+ --protected
3
+ --readme 'README.md'
@@ -2,7 +2,23 @@
2
2
 
3
3
  Format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
4
4
 
5
- ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.6...master)
5
+ ## [[Unreleased]](https://github.com/esotericpig/nhkore/compare/v0.3.7...HEAD)
6
+
7
+ ## [v0.3.7] - 2020-11-07
8
+
9
+ ### Changed
10
+ - Updated Gem `attr_bool` to v0.2
11
+ - Changed upper-case *'-V'* flag for *version* to be a lower-case *'-v'*
12
+ - Seems like a lot of apps/people expect this
13
+ - Refactored/Formatted some code
14
+ - *nhkore.gemspec* especially
15
+ - Added *samples/*, *Gemfile.lock*, and *.yardopts* to the files in *nhkore.gemspec*
16
+
17
+ ### Fixed
18
+ - ArticleScraper
19
+ - Fixed to accept text nodes that have Kanji, due to bad article:
20
+ - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
21
+ - `第3のビール` should have HTML ruby tags around *第*
6
22
 
7
23
  ## [v0.3.6] - 2020-08-18
8
24
 
@@ -0,0 +1,86 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ nhkore (0.3.7)
5
+ attr_bool (~> 0.2)
6
+ bimyou_segmenter (~> 1.2)
7
+ cri (~> 2.15)
8
+ down (~> 5.1)
9
+ highline (~> 2.0)
10
+ http-cookie (~> 1.0)
11
+ japanese_deinflector (~> 0.0)
12
+ nokogiri (~> 1.10)
13
+ psychgus (~> 1.3)
14
+ public_suffix (~> 4.0)
15
+ rainbow (~> 3.0)
16
+ rubyzip (~> 2.3)
17
+ tiny_segmenter (~> 0.0)
18
+ tty-progressbar (~> 0.17)
19
+ tty-spinner (~> 0.9)
20
+
21
+ GEM
22
+ remote: https://rubygems.org/
23
+ specs:
24
+ addressable (2.7.0)
25
+ public_suffix (>= 2.0.2, < 5.0)
26
+ attr_bool (0.2.1)
27
+ bimyou_segmenter (1.2.0)
28
+ cri (2.15.10)
29
+ domain_name (0.5.20190701)
30
+ unf (>= 0.0.5, < 1.0.0)
31
+ down (5.2.0)
32
+ addressable (~> 2.5)
33
+ highline (2.0.3)
34
+ http-cookie (1.0.3)
35
+ domain_name (~> 0.5)
36
+ japanese_deinflector (0.0.2)
37
+ mini_portile2 (2.4.0)
38
+ minitest (5.14.2)
39
+ nokogiri (1.10.10)
40
+ mini_portile2 (~> 2.4.0)
41
+ psych (3.2.0)
42
+ psychgus (1.3.3)
43
+ psych (>= 3.0)
44
+ public_suffix (4.0.6)
45
+ rainbow (3.0.0)
46
+ rake (13.0.1)
47
+ raketeer (0.2.9)
48
+ rake
49
+ rdoc (6.2.1)
50
+ redcarpet (3.5.0)
51
+ rubyzip (2.3.0)
52
+ strings-ansi (0.1.0)
53
+ tiny_segmenter (0.0.6)
54
+ tty-cursor (0.7.1)
55
+ tty-progressbar (0.17.0)
56
+ strings-ansi (~> 0.1.0)
57
+ tty-cursor (~> 0.7)
58
+ tty-screen (~> 0.7)
59
+ unicode-display_width (~> 1.6)
60
+ tty-screen (0.8.1)
61
+ tty-spinner (0.9.3)
62
+ tty-cursor (~> 0.7)
63
+ unf (0.1.4)
64
+ unf_ext
65
+ unf_ext (0.0.7.7)
66
+ unicode-display_width (1.7.0)
67
+ yard (0.9.25)
68
+ yard_ghurt (1.2.0)
69
+ rake
70
+
71
+ PLATFORMS
72
+ ruby
73
+
74
+ DEPENDENCIES
75
+ bundler (~> 2.1)
76
+ minitest (~> 5.14)
77
+ nhkore!
78
+ rake (~> 13.0)
79
+ raketeer (~> 0.2)
80
+ rdoc (~> 6.2)
81
+ redcarpet (~> 3.5)
82
+ yard (~> 0.9)
83
+ yard_ghurt (~> 1.2)
84
+
85
+ BUNDLED WITH
86
+ 2.1.4
@@ -246,8 +246,7 @@ module NHKore
246
246
  app.scraper_kargs[:header] ||= {}
247
247
  app.scraper_kargs[:header]['user-agent'] = value
248
248
  end
249
- # Big V, not small.
250
- flag :V,:version,'show the version and exit' do |value,cmd|
249
+ flag :v,:version,'show the version and exit' do |value,cmd|
251
250
  app.show_version()
252
251
  exit
253
252
  end
@@ -43,6 +43,8 @@ module NHKore
43
43
  # @since 0.2.0
44
44
  ###
45
45
  class ArticleScraper < Scraper
46
+ extend AttrBool::Ext
47
+
46
48
  attr_reader :cleaners
47
49
  attr_accessor :datetime
48
50
  attr_accessor :dict
@@ -463,7 +465,10 @@ module NHKore
463
465
  return nil
464
466
  end
465
467
 
466
- text = word.kana # Should be kana only
468
+ # Kanji only for:
469
+ # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
470
+ # - '第3のビール'
471
+ text = word.word # Should usually be kana only
467
472
 
468
473
  result.add_text(text) # No cleaning; raw text
469
474
 
@@ -472,8 +477,9 @@ module NHKore
472
477
  return nil if text.empty?() # No error; empty text is fine here
473
478
 
474
479
  word = Word.new(
475
- kana: text,
476
- word: word
480
+ kana: clean(word.kana),
481
+ kanji: clean(word.kanji),
482
+ word: word,
477
483
  )
478
484
 
479
485
  return word
@@ -34,6 +34,8 @@ module NHKore
34
34
  # @since 0.3.4
35
35
  ###
36
36
  class DatetimeParser
37
+ extend AttrBool::Ext
38
+
37
39
  # Order matters!
38
40
  FMTS = [
39
41
  '%Y-%m-%d %H:%M',
@@ -35,6 +35,8 @@ module NHKore
35
35
  # @since 0.2.0
36
36
  ###
37
37
  class Scraper
38
+ extend AttrBool::Ext
39
+
38
40
  DEFAULT_HEADER = {
39
41
  'user-agent' => UserAgents.sample(),
40
42
  'accept' => 'text/html,application/xhtml+xml,application/xml,application/rss+xml,text/xml;image/webp,image/apng,*/*;application/signed-exchange',
@@ -34,6 +34,8 @@ module NHKore
34
34
  # @since 0.2.0
35
35
  ###
36
36
  class SearchLink
37
+ extend AttrBool::Ext
38
+
37
39
  attr_reader :datetime
38
40
  attr_reader :futsuurl
39
41
  attr_accessor? :scraped
@@ -45,6 +45,16 @@ module NHKore
45
45
  # - https://www3.nhk.or.jp/news/easy/article/disaster_heat.html
46
46
  YASASHII_REGEX = /\A[^\.]+\.#{Regexp.quote(YASASHII_SITE)}.+\.html?/i
47
47
 
48
+ IGNORE_LINK_REGEX = %r{
49
+ /about\.html? # https://www3.nhk.or.jp/news/easy/about.html
50
+ |/movieplayer\.html? # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
51
+ |/audio\.html? # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
52
+ |/news/easy/index\.html? # http://www3.nhk.or.jp/news/easy/index.html
53
+ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
54
+ # https://www3.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10012689671000&title=「鬼滅の刃」の映画が台湾でも始まって大勢の人が見に行く
55
+ |/enqform\.html?
56
+ }x
57
+
48
58
  # Search Engines are strict, so trigger using the default HTTP header fields
49
59
  # with +header: {}+ and fetch/set the cookie using +eat_cookie: true+.
50
60
  def initialize(url,eat_cookie: true,header: {},**kargs)
@@ -57,11 +67,8 @@ module NHKore
57
67
  link = Util.unspace_web_str(link).downcase() unless cleaned
58
68
 
59
69
  return true if link.empty?()
60
- return true if link =~ /\/about\.html?/ # https://www3.nhk.or.jp/news/easy/about.html
61
- return true if link =~ /\/movieplayer\.html?/ # https://www3.nhk.or.jp/news/easy/movieplayer.html?id=k10038422811_1207251719_1207251728.mp4&teacuprbbs=4feb73432045dbb97c283d64d459f7cf
62
- return true if link =~ /\/audio\.html?/ # https://www3.nhk.or.jp/news/easy/player/audio.html?id=k10011555691000
63
- return true if link =~ /\/news\/easy\/index\.html?/ # http://www3.nhk.or.jp/news/easy/index.html
64
- return true if link =~ /cgi2.*enqform/ # https://cgi2.nhk.or.jp/news/easy/easy_enq/bin/form/enqform.html?id=k10011916321000&title=日本の会社が作った鉄道の車両「あずま」がイギリスで走る
70
+
71
+ return true if IGNORE_LINK_REGEX.match?(link)
65
72
 
66
73
  return false
67
74
  end
@@ -22,5 +22,5 @@
22
22
 
23
23
 
24
24
  module NHKore
25
- VERSION = '0.3.6'
25
+ VERSION = '0.3.7'
26
26
  end
@@ -56,11 +56,15 @@ module NHKore
56
56
  if !unknown.nil?()
57
57
  # kanji?() only tests if it contains kanji, so don't use kana?().
58
58
  if Util.kanji?(unknown)
59
- raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]" unless Util.empty_web_str?(kanji)
59
+ if !Util.empty_web_str?(kanji)
60
+ raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
61
+ end
60
62
 
61
63
  kanji = unknown
62
64
  else
63
- raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]" unless Util.empty_web_str?(kana)
65
+ if !Util.empty_web_str?(kana)
66
+ raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
67
+ end
64
68
 
65
69
  kana = unknown
66
70
  end
@@ -25,7 +25,6 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
25
25
 
26
26
  require 'nhkore/version'
27
27
 
28
-
29
28
  Gem::Specification.new() do |spec|
30
29
  spec.name = 'nhkore'
31
30
  spec.version = NHKore::VERSION
@@ -39,31 +38,33 @@ Gem::Specification.new() do |spec|
39
38
  ' Includes a CLI app and a scraper library.'
40
39
 
41
40
  spec.metadata = {
42
- 'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
43
- 'changelog_uri' => 'https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md',
44
- 'homepage_uri' => 'https://github.com/esotericpig/nhkore',
45
- 'source_code_uri' => 'https://github.com/esotericpig/nhkore'
41
+ 'homepage_uri' => 'https://github.com/esotericpig/nhkore',
42
+ 'source_code_uri' => 'https://github.com/esotericpig/nhkore',
43
+ 'bug_tracker_uri' => 'https://github.com/esotericpig/nhkore/issues',
44
+ 'changelog_uri' => 'https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md',
45
+ #'documentation_uri' => '',
46
+ #'wiki_uri' => '',
47
+ #'mailing_list_uri' => '',
46
48
  }
47
49
 
48
- spec.require_paths = ['lib']
49
- spec.bindir = 'bin'
50
- spec.executables = [spec.name]
50
+ spec.requirements = [
51
+ 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html',
52
+ ]
53
+
54
+ spec.required_ruby_version = '>= 2.4'
55
+ spec.require_paths = ['lib']
56
+ spec.bindir = 'bin'
57
+ spec.executables = [spec.name]
51
58
 
52
59
  spec.files = [
53
60
  Dir.glob(File.join("{#{spec.require_paths.join(',')}}",'**','*.{erb,rb}')),
54
61
  Dir.glob(File.join(spec.bindir,'*')),
55
- Dir.glob(File.join('{test,yard}','**','*.{erb,rb}')),
56
- %W( Gemfile #{spec.name}.gemspec Rakefile ),
57
- %w( CHANGELOG.md LICENSE.txt README.md ),
62
+ Dir.glob(File.join('{samples,test,yard}','**','*.{erb,rb}')),
63
+ %W[ Gemfile Gemfile.lock #{spec.name}.gemspec Rakefile .yardopts ],
64
+ %w[ CHANGELOG.md LICENSE.txt README.md ],
58
65
  ].flatten()
59
66
 
60
- spec.required_ruby_version = '>= 2.4'
61
-
62
- spec.requirements = [
63
- 'Nokogiri: https://www.nokogiri.org/tutorials/installing_nokogiri.html',
64
- ]
65
-
66
- spec.add_runtime_dependency 'attr_bool' ,'~> 0.1' # For attr_accessor?/attr_reader?
67
+ spec.add_runtime_dependency 'attr_bool' ,'~> 0.2' # For attr_accessor?/attr_reader?
67
68
  spec.add_runtime_dependency 'bimyou_segmenter' ,'~> 1.2' # For splitting Japanese sentences into words
68
69
  spec.add_runtime_dependency 'cri' ,'~> 2.15' # For CLI commands/options
69
70
  spec.add_runtime_dependency 'down' ,'~> 5.1' # For downloading files (GetCmd)
@@ -97,8 +98,17 @@ Gem::Specification.new() do |spec|
97
98
  Homepage: #{spec.homepage}
98
99
 
99
100
  Code: #{spec.metadata['source_code_uri']}
100
- Changelog: #{spec.metadata['changelog_uri']}
101
101
  Bugs: #{spec.metadata['bug_tracker_uri']}
102
102
 
103
+ Changelog: #{spec.metadata['changelog_uri']}
104
+
103
105
  EOM
106
+
107
+ spec.extra_rdoc_files = %w[ CHANGELOG.md LICENSE.txt README.md ]
108
+
109
+ spec.rdoc_options = [
110
+ '--hyperlink-all','--show-hash',
111
+ '--title',"NHKore v#{NHKore::VERSION} Doc",
112
+ '--main','README.md',
113
+ ]
104
114
  end
@@ -0,0 +1,82 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: UTF-8
3
+ # frozen_string_literal: true
4
+
5
+ #--
6
+ # This file is part of NHKore.
7
+ # Copyright (c) 2020 Jonathan Bradley Whited (@esotericpig)
8
+ #
9
+ # NHKore is free software: you can redistribute it and/or modify
10
+ # it under the terms of the GNU Lesser General Public License as published by
11
+ # the Free Software Foundation, either version 3 of the License, or
12
+ # (at your option) any later version.
13
+ #
14
+ # NHKore is distributed in the hope that it will be useful,
15
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ # GNU Lesser General Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU Lesser General Public License
20
+ # along with NHKore. If not, see <https://www.gnu.org/licenses/>.
21
+ #++
22
+
23
+
24
+ ###
25
+ # If you run this script, be aware that it uses the +-F+ force option
26
+ # (which overwrites files without prompting).
27
+ #
28
+ # @author Jonathan Bradley Whited (@esotericpig)
29
+ # @since 0.3.2
30
+ ###
31
+
32
+ case ARGV[0]
33
+ when '-c' # count
34
+ system('nhkore search ez --show-count')
35
+ puts
36
+ puts %q{Use the first number with the '-a' option.}
37
+ exit
38
+ when '-a' # articles
39
+ articles = ARGV[1].to_i()
40
+ articles = 0 if articles < 0
41
+ else
42
+ puts 'Options:'
43
+ puts ' -c show count to use with -a'
44
+ puts ' -a <int> number of articles already scraped; execute scraping'
45
+ exit
46
+ end
47
+
48
+ articles_inc = 25
49
+ max_errors = 5 # Exit, for example, if 404 errors repeatedly
50
+ max_loop = 5 # Possible total = articles_inc * max_loop
51
+
52
+ thread = Thread.new() do
53
+ i = 0
54
+
55
+ while i < max_loop
56
+ puts "Loop #{i += 1} => #{articles} articles"
57
+
58
+ if system("nhkore -F -t 300 -m 10 news ez -s #{articles_inc}")
59
+ articles += articles_inc
60
+ else
61
+ break if (max_errors -= 1) <= 0
62
+ end
63
+
64
+ puts
65
+ end
66
+ end
67
+
68
+ # Ctrl+C
69
+ trap('INT') do
70
+ if thread.alive?()
71
+ # Try to exit gracefully.
72
+ max_loop = -1
73
+ thread.join(5)
74
+
75
+ # Die!
76
+ thread.kill() if thread.alive?()
77
+ end
78
+
79
+ exit
80
+ end
81
+
82
+ thread.join() # Run
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nhkore
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Bradley Whited (@esotericpig)
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-18 00:00:00.000000000 Z
11
+ date: 2020-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: attr_bool
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0.1'
19
+ version: '0.2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0.1'
26
+ version: '0.2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bimyou_segmenter
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -339,10 +339,15 @@ email:
339
339
  executables:
340
340
  - nhkore
341
341
  extensions: []
342
- extra_rdoc_files: []
342
+ extra_rdoc_files:
343
+ - CHANGELOG.md
344
+ - LICENSE.txt
345
+ - README.md
343
346
  files:
347
+ - ".yardopts"
344
348
  - CHANGELOG.md
345
349
  - Gemfile
350
+ - Gemfile.lock
346
351
  - LICENSE.txt
347
352
  - README.md
348
353
  - Rakefile
@@ -379,6 +384,7 @@ files:
379
384
  - lib/nhkore/version.rb
380
385
  - lib/nhkore/word.rb
381
386
  - nhkore.gemspec
387
+ - samples/looper.rb
382
388
  - test/nhkore/test_helper.rb
383
389
  - test/nhkore_test.rb
384
390
  - yard/templates/default/layout/html/footer.erb
@@ -386,15 +392,22 @@ homepage: https://github.com/esotericpig/nhkore
386
392
  licenses:
387
393
  - LGPL-3.0-or-later
388
394
  metadata:
389
- bug_tracker_uri: https://github.com/esotericpig/nhkore/issues
390
- changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
391
395
  homepage_uri: https://github.com/esotericpig/nhkore
392
396
  source_code_uri: https://github.com/esotericpig/nhkore
393
- post_install_message: " \n NHKore v0.3.6\n \n You can now use [nhkore] on the
397
+ bug_tracker_uri: https://github.com/esotericpig/nhkore/issues
398
+ changelog_uri: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md
399
+ post_install_message: " \n NHKore v0.3.7\n \n You can now use [nhkore] on the
394
400
  command line.\n \n Homepage: https://github.com/esotericpig/nhkore\n \n Code:
395
- \ https://github.com/esotericpig/nhkore\n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
396
- \ Bugs: https://github.com/esotericpig/nhkore/issues\n \n"
397
- rdoc_options: []
401
+ \ https://github.com/esotericpig/nhkore\n Bugs: https://github.com/esotericpig/nhkore/issues\n
402
+ \ \n Changelog: https://github.com/esotericpig/nhkore/blob/master/CHANGELOG.md\n
403
+ \ \n"
404
+ rdoc_options:
405
+ - "--hyperlink-all"
406
+ - "--show-hash"
407
+ - "--title"
408
+ - NHKore v0.3.7 Doc
409
+ - "--main"
410
+ - README.md
398
411
  require_paths:
399
412
  - lib
400
413
  required_ruby_version: !ruby/object:Gem::Requirement