opener-tokenizer 2.0.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/tokenizer +2 -2
- data/lib/opener/tokenizer.rb +18 -21
- data/lib/opener/tokenizer/cli.rb +57 -75
- data/lib/opener/tokenizer/version.rb +1 -1
- data/opener-tokenizer.gemspec +3 -1
- metadata +30 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a01694d8c3c4cabbeadcee2b2e478ed699a21555
|
4
|
+
data.tar.gz: 3513b6bd6fe22ea36edc82eb2248204f24dedc8e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3928380d43ccd980b675562c0a2b19e3ae45ebe410d300314685678b9c2829dac0c66b5cc805fbc4a22ed6267563869fe7fca5bee11ddc78e17cea6cabcec5f6
|
7
|
+
data.tar.gz: dc0a947729d4f97f49a919a240c8ab73b8122315add3aeecc65cd8ada3c0474e5faca2d4c0e84f69dce77ad74937f22e7b65932bae146c5592b129d17f5ed42f
|
data/bin/tokenizer
CHANGED
data/lib/opener/tokenizer.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'opener/tokenizers/base'
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'open3'
|
4
|
-
require 'optparse'
|
4
|
+
require 'slop'
|
5
5
|
|
6
6
|
require_relative 'tokenizer/version'
|
7
7
|
require_relative 'tokenizer/cli'
|
@@ -52,35 +52,32 @@ module Opener
|
|
52
52
|
end
|
53
53
|
|
54
54
|
##
|
55
|
-
#
|
56
|
-
# STDERR and an object containing process information.
|
55
|
+
# Tokenizes the input and returns the results as a KAF document.
|
57
56
|
#
|
58
57
|
# @param [String] input
|
59
|
-
# @return [
|
58
|
+
# @return [String]
|
60
59
|
#
|
61
60
|
def run(input)
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
61
|
+
if options[:kaf]
|
62
|
+
language, input = kaf_elements(input)
|
63
|
+
else
|
64
|
+
language = options[:language]
|
65
|
+
end
|
68
66
|
|
69
|
-
|
70
|
-
|
71
|
-
|
67
|
+
unless valid_language?(language)
|
68
|
+
raise ArgumentError, "The specified language (#{language}) is invalid"
|
69
|
+
end
|
72
70
|
|
73
|
-
|
71
|
+
kernel = language_constant(language).new(:args => options[:args])
|
74
72
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
73
|
+
stdout, stderr, process = Open3.capture3(
|
74
|
+
*kernel.command.split(" "),
|
75
|
+
:stdin_data => input
|
76
|
+
)
|
79
77
|
|
80
|
-
|
78
|
+
raise stderr unless process.success?
|
81
79
|
|
82
|
-
|
83
|
-
end
|
80
|
+
return stdout
|
84
81
|
end
|
85
82
|
|
86
83
|
alias tokenize run
|
data/lib/opener/tokenizer/cli.rb
CHANGED
@@ -1,110 +1,92 @@
|
|
1
1
|
module Opener
|
2
2
|
class Tokenizer
|
3
3
|
##
|
4
|
-
# CLI wrapper around {Opener::Tokenizer} using OptionParser.
|
4
|
+
# CLI wrapper around {Opener::Tokenizer} using Slop.
|
5
5
|
#
|
6
|
-
# @!attribute [r]
|
7
|
-
# @return [
|
8
|
-
# @!attribute [r] option_parser
|
9
|
-
# @return [OptionParser]
|
6
|
+
# @!attribute [r] parser
|
7
|
+
# @return [Slop]
|
10
8
|
#
|
11
9
|
class CLI
|
12
|
-
attr_reader :
|
10
|
+
attr_reader :parser
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@parser = configure_slop
|
14
|
+
end
|
13
15
|
|
14
16
|
##
|
15
|
-
# @param [
|
17
|
+
# @param [Array] argv
|
16
18
|
#
|
17
|
-
def
|
18
|
-
|
19
|
-
|
20
|
-
@option_parser = OptionParser.new do |opts|
|
21
|
-
opts.program_name = 'tokenizer'
|
22
|
-
opts.summary_indent = ' '
|
23
|
-
|
24
|
-
opts.on('-h', '--help', 'Shows this help message') do
|
25
|
-
show_help
|
26
|
-
end
|
27
|
-
|
28
|
-
opts.on('-v', '--version', 'Shows the current version') do
|
29
|
-
show_version
|
30
|
-
end
|
19
|
+
def run(argv = ARGV)
|
20
|
+
parser.parse(argv)
|
21
|
+
end
|
31
22
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
@options[:kaf] = false
|
39
|
-
end
|
23
|
+
##
|
24
|
+
# @return [Slop]
|
25
|
+
#
|
26
|
+
def configure_slop
|
27
|
+
return Slop.new(:strict => false, :indent => 2, :help => true) do
|
28
|
+
banner 'Usage: tokenizer [OPTIONS]'
|
40
29
|
|
41
|
-
|
42
|
-
@options[:kaf] = true
|
43
|
-
end
|
30
|
+
separator <<-EOF.chomp
|
44
31
|
|
45
|
-
|
46
|
-
@options[:kaf] = false
|
47
|
-
end
|
32
|
+
About:
|
48
33
|
|
49
|
-
|
34
|
+
Tokenizer for KAF/plain text documents with support for various languages
|
35
|
+
such as Dutch and English. This command reads input from STDIN.
|
50
36
|
|
51
37
|
Examples:
|
52
38
|
|
53
|
-
|
54
|
-
|
39
|
+
cat example.txt | tokenizer -l en # Manually specify the language
|
40
|
+
cat example.kaf | tokenizer # Uses the xml:lang attribute
|
55
41
|
|
56
42
|
Languages:
|
57
43
|
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
44
|
+
* Dutch (nl)
|
45
|
+
* English (en)
|
46
|
+
* French (fr)
|
47
|
+
* German (de)
|
48
|
+
* Italian (it)
|
49
|
+
* Spanish (es)
|
64
50
|
|
65
51
|
KAF Input:
|
66
52
|
|
67
|
-
|
68
|
-
|
69
|
-
|
53
|
+
If you give a KAF file as an input (-k or --kaf) the language is taken from
|
54
|
+
the xml:lang attribute inside the file. Else it expects that you give the
|
55
|
+
language as an argument (-l or --language)
|
70
56
|
|
71
|
-
|
57
|
+
Example KAF:
|
72
58
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
59
|
+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
60
|
+
<KAF version="v1.opener" xml:lang="en">
|
61
|
+
<raw>This is some text.</raw>
|
62
|
+
</KAF>
|
77
63
|
EOF
|
78
|
-
end
|
79
|
-
end
|
80
64
|
|
81
|
-
|
82
|
-
# @param [String] input
|
83
|
-
#
|
84
|
-
def run(input)
|
85
|
-
option_parser.parse!(options[:args])
|
65
|
+
separator "\nOptions:\n"
|
86
66
|
|
87
|
-
|
67
|
+
on :v, :version, 'Shows the current version' do
|
68
|
+
abort "tokenizer v#{VERSION} on #{RUBY_DESCRIPTION}"
|
69
|
+
end
|
88
70
|
|
89
|
-
|
71
|
+
on :l=, :language=, 'A specific language to use',
|
72
|
+
:as => String,
|
73
|
+
:default => DEFAULT_LANGUAGE
|
90
74
|
|
91
|
-
|
92
|
-
|
75
|
+
on :k, :kaf, 'Treats the input as a KAF document'
|
76
|
+
on :p, :plain, 'Treats the input as plain text'
|
93
77
|
|
94
|
-
|
78
|
+
run do |opts, args|
|
79
|
+
tokenizer = Tokenizer.new(
|
80
|
+
:args => args,
|
81
|
+
:kaf => opts[:plain] ? false : true,
|
82
|
+
:language => opts[:language]
|
83
|
+
)
|
95
84
|
|
96
|
-
|
97
|
-
# Shows the help message and exits the program.
|
98
|
-
#
|
99
|
-
def show_help
|
100
|
-
abort option_parser.to_s
|
101
|
-
end
|
85
|
+
input = STDIN.tty? ? nil : STDIN.read
|
102
86
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
def show_version
|
107
|
-
abort "#{option_parser.program_name} v#{VERSION} on #{RUBY_DESCRIPTION}"
|
87
|
+
puts tokenizer.run(input)
|
88
|
+
end
|
89
|
+
end
|
108
90
|
end
|
109
91
|
end # CLI
|
110
92
|
end # Tokenizer
|
data/opener-tokenizer.gemspec
CHANGED
@@ -24,12 +24,14 @@ Gem::Specification.new do |gem|
|
|
24
24
|
|
25
25
|
gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
|
26
26
|
|
27
|
-
gem.add_dependency 'nokogiri'
|
28
27
|
gem.add_dependency 'opener-tokenizer-base', '~> 1.0'
|
29
28
|
gem.add_dependency 'opener-webservice', '~> 2.1'
|
30
29
|
gem.add_dependency 'opener-daemons', '~> 2.1'
|
31
30
|
gem.add_dependency 'opener-core', '~> 2.0'
|
32
31
|
|
32
|
+
gem.add_dependency 'nokogiri'
|
33
|
+
gem.add_dependency 'slop', '~> 3.5'
|
34
|
+
|
33
35
|
gem.add_development_dependency 'rspec'
|
34
36
|
gem.add_development_dependency 'cucumber'
|
35
37
|
gem.add_development_dependency 'pry'
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: opener-tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- development@olery.com
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-11-
|
11
|
+
date: 2014-11-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: nokogiri
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ">="
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ">="
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: opener-tokenizer-base
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,6 +66,34 @@ dependencies:
|
|
80
66
|
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
68
|
version: '2.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: nokogiri
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: slop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '3.5'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '3.5'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: rspec
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|