opener-tokenizer 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/tokenizer +2 -2
- data/lib/opener/tokenizer.rb +18 -21
- data/lib/opener/tokenizer/cli.rb +57 -75
- data/lib/opener/tokenizer/version.rb +1 -1
- data/opener-tokenizer.gemspec +3 -1
- metadata +30 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a01694d8c3c4cabbeadcee2b2e478ed699a21555
|
4
|
+
data.tar.gz: 3513b6bd6fe22ea36edc82eb2248204f24dedc8e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3928380d43ccd980b675562c0a2b19e3ae45ebe410d300314685678b9c2829dac0c66b5cc805fbc4a22ed6267563869fe7fca5bee11ddc78e17cea6cabcec5f6
|
7
|
+
data.tar.gz: dc0a947729d4f97f49a919a240c8ab73b8122315add3aeecc65cd8ada3c0474e5faca2d4c0e84f69dce77ad74937f22e7b65932bae146c5592b129d17f5ed42f
|
data/bin/tokenizer
CHANGED
data/lib/opener/tokenizer.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'opener/tokenizers/base'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'open3'
|
4
|
-
require '
|
4
|
+
require 'slop'
|
5
5
|
|
6
6
|
require_relative 'tokenizer/version'
|
7
7
|
require_relative 'tokenizer/cli'
|
@@ -52,35 +52,32 @@ module Opener
|
|
52
52
|
end
|
53
53
|
|
54
54
|
##
|
55
|
-
#
|
56
|
-
# STDERR and an object containing process information.
|
55
|
+
# Tokenizes the input and returns the results as a KAF document.
|
57
56
|
#
|
58
57
|
# @param [String] input
|
59
|
-
# @return [
|
58
|
+
# @return [String]
|
60
59
|
#
|
61
60
|
def run(input)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
61
|
+
if options[:kaf]
|
62
|
+
language, input = kaf_elements(input)
|
63
|
+
else
|
64
|
+
language = options[:language]
|
65
|
+
end
|
68
66
|
|
69
|
-
|
70
|
-
|
71
|
-
|
67
|
+
unless valid_language?(language)
|
68
|
+
raise ArgumentError, "The specified language (#{language}) is invalid"
|
69
|
+
end
|
72
70
|
|
73
|
-
|
71
|
+
kernel = language_constant(language).new(:args => options[:args])
|
74
72
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
stdout, stderr, process = Open3.capture3(
|
74
|
+
*kernel.command.split(" "),
|
75
|
+
:stdin_data => input
|
76
|
+
)
|
79
77
|
|
80
|
-
|
78
|
+
raise stderr unless process.success?
|
81
79
|
|
82
|
-
|
83
|
-
end
|
80
|
+
return stdout
|
84
81
|
end
|
85
82
|
|
86
83
|
alias tokenize run
|
data/lib/opener/tokenizer/cli.rb
CHANGED
@@ -1,110 +1,92 @@
|
|
1
1
|
module Opener
|
2
2
|
class Tokenizer
|
3
3
|
##
|
4
|
-
# CLI wrapper around {Opener::Tokenizer} using OptionParser.
|
4
|
+
# CLI wrapper around {Opener::Tokenizer} using Slop.
|
5
5
|
#
|
6
|
-
# @!attribute [r]
|
7
|
-
# @return [
|
8
|
-
# @!attribute [r] option_parser
|
9
|
-
# @return [OptionParser]
|
6
|
+
# @!attribute [r] parser
|
7
|
+
# @return [Slop]
|
10
8
|
#
|
11
9
|
class CLI
|
12
|
-
attr_reader :
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@parser = configure_slop
|
14
|
+
end
|
13
15
|
|
14
16
|
##
|
15
|
-
# @param [
|
17
|
+
# @param [Array] argv
|
16
18
|
#
|
17
|
-
def
|
18
|
-
|
19
|
-
|
20
|
-
@option_parser = OptionParser.new do |opts|
|
21
|
-
opts.program_name = 'tokenizer'
|
22
|
-
opts.summary_indent = ' '
|
23
|
-
|
24
|
-
opts.on('-h', '--help', 'Shows this help message') do
|
25
|
-
show_help
|
26
|
-
end
|
27
|
-
|
28
|
-
opts.on('-v', '--version', 'Shows the current version') do
|
29
|
-
show_version
|
30
|
-
end
|
19
|
+
def run(argv = ARGV)
|
20
|
+
parser.parse(argv)
|
21
|
+
end
|
31
22
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
@options[:kaf] = false
|
39
|
-
end
|
23
|
+
##
|
24
|
+
# @return [Slop]
|
25
|
+
#
|
26
|
+
def configure_slop
|
27
|
+
return Slop.new(:strict => false, :indent => 2, :help => true) do
|
28
|
+
banner 'Usage: tokenizer [OPTIONS]'
|
40
29
|
|
41
|
-
|
42
|
-
@options[:kaf] = true
|
43
|
-
end
|
30
|
+
separator <<-EOF.chomp
|
44
31
|
|
45
|
-
|
46
|
-
@options[:kaf] = false
|
47
|
-
end
|
32
|
+
About:
|
48
33
|
|
49
|
-
|
34
|
+
Tokenizer for KAF/plain text documents with support for various languages
|
35
|
+
such as Dutch and English. This command reads input from STDIN.
|
50
36
|
|
51
37
|
Examples:
|
52
38
|
|
53
|
-
|
54
|
-
|
39
|
+
cat example.txt | tokenizer -l en # Manually specify the language
|
40
|
+
cat example.kaf | tokenizer # Uses the xml:lang attribute
|
55
41
|
|
56
42
|
Languages:
|
57
43
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
44
|
+
* Dutch (nl)
|
45
|
+
* English (en)
|
46
|
+
* French (fr)
|
47
|
+
* German (de)
|
48
|
+
* Italian (it)
|
49
|
+
* Spanish (es)
|
64
50
|
|
65
51
|
KAF Input:
|
66
52
|
|
67
|
-
|
68
|
-
|
69
|
-
|
53
|
+
If you give a KAF file as an input (-k or --kaf) the language is taken from
|
54
|
+
the xml:lang attribute inside the file. Else it expects that you give the
|
55
|
+
language as an argument (-l or --language)
|
70
56
|
|
71
|
-
|
57
|
+
Example KAF:
|
72
58
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
59
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
60
|
+
<KAF version="v1.opener" xml:lang="en">
|
61
|
+
<raw>This is some text.</raw>
|
62
|
+
</KAF>
|
77
63
|
EOF
|
78
|
-
end
|
79
|
-
end
|
80
64
|
|
81
|
-
|
82
|
-
# @param [String] input
|
83
|
-
#
|
84
|
-
def run(input)
|
85
|
-
option_parser.parse!(options[:args])
|
65
|
+
separator "\nOptions:\n"
|
86
66
|
|
87
|
-
|
67
|
+
on :v, :version, 'Shows the current version' do
|
68
|
+
abort "tokenizer v#{VERSION} on #{RUBY_DESCRIPTION}"
|
69
|
+
end
|
88
70
|
|
89
|
-
|
71
|
+
on :l=, :language=, 'A specific language to use',
|
72
|
+
:as => String,
|
73
|
+
:default => DEFAULT_LANGUAGE
|
90
74
|
|
91
|
-
|
92
|
-
|
75
|
+
on :k, :kaf, 'Treats the input as a KAF document'
|
76
|
+
on :p, :plain, 'Treats the input as plain text'
|
93
77
|
|
94
|
-
|
78
|
+
run do |opts, args|
|
79
|
+
tokenizer = Tokenizer.new(
|
80
|
+
:args => args,
|
81
|
+
:kaf => opts[:plain] ? false : true,
|
82
|
+
:language => opts[:language]
|
83
|
+
)
|
95
84
|
|
96
|
-
|
97
|
-
# Shows the help message and exits the program.
|
98
|
-
#
|
99
|
-
def show_help
|
100
|
-
abort option_parser.to_s
|
101
|
-
end
|
85
|
+
input = STDIN.tty? ? nil : STDIN.read
|
102
86
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
def show_version
|
107
|
-
abort "#{option_parser.program_name} v#{VERSION} on #{RUBY_DESCRIPTION}"
|
87
|
+
puts tokenizer.run(input)
|
88
|
+
end
|
89
|
+
end
|
108
90
|
end
|
109
91
|
end # CLI
|
110
92
|
end # Tokenizer
|
data/opener-tokenizer.gemspec
CHANGED
@@ -24,12 +24,14 @@ Gem::Specification.new do |gem|
|
|
24
24
|
|
25
25
|
gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
|
26
26
|
|
27
|
-
gem.add_dependency 'nokogiri'
|
28
27
|
gem.add_dependency 'opener-tokenizer-base', '~> 1.0'
|
29
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
30
29
|
gem.add_dependency 'opener-daemons', '~> 2.1'
|
31
30
|
gem.add_dependency 'opener-core', '~> 2.0'
|
32
31
|
|
32
|
+
gem.add_dependency 'nokogiri'
|
33
|
+
gem.add_dependency 'slop', '~> 3.5'
|
34
|
+
|
33
35
|
gem.add_development_dependency 'rspec'
|
34
36
|
gem.add_development_dependency 'cucumber'
|
35
37
|
gem.add_development_dependency 'pry'
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: nokogiri
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: opener-tokenizer-base
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,6 +66,34 @@ dependencies:
|
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
68
|
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: slop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.5'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.5'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rspec
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|