opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
data/core/tokenizer.pl ADDED
@@ -0,0 +1,145 @@
+ #!/usr/bin/perl -w
+
+ # This script tokenizes input sentences.
+ # It implements the Moses tokenizer, modified for OpeNER
+ # by Aitor García and Andoni Azpeitia.
+
+ use FindBin;
+ use utf8;
+
+ my %NONBREAKING_PREFIX = ();
+ my $LANGUAGE;
+ my $SUBSTITUTE = "####";
+
+ sub init_tokenizer {
+   $LANGUAGE = shift(@_);
+   %NONBREAKING_PREFIX = %{ shift(@_) };
+ }
+
+ sub tokenize {
+
+   my($text) = shift(@_);
+   chomp($text);
+   #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+   # tokenize dashes at the beginning of lines
+   $text =~ s/^\-([^ ])/\- $1/g;
+
+   # turn ’ into '
+   $text =~ s/’/\'/g;
+   #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+   $text = " $text ";
+   # separate out all "other" special characters
+   $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-\’])/ $1 /g;
+   #$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+   # multi-dots stay together
+   $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+   while($text =~ /DOTMULTI\./) {
+     $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+     $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+   }
+   # separate out "," except if within numbers (5,300)
+   $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+   # separate "," pre and post number
+   $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+   $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+   # turn ` into '
+   $text =~ s/\`/\'$SUBSTITUTE/g;
+
+   # turn '' into "
+   $text =~ s/\'\'/ \"$SUBSTITUTE /g;
+   #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+   # tokenize words like '05-'06
+   $text =~ s/(['|’])([0-9][0-9])\-(['|’])([0-9][0-9])/$1$2 - $3$4/g;
+   # replace the ' with ### so that words like '90 are not split apart
+   $text =~ s/ ['|’]([0-9][0-9])/ ###$1/g;
+   #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+   if ($LANGUAGE eq "en") {
+     # split contractions right
+     $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([^\p{IsAlpha}\p{IsN}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2$3/g;
+     # special case for "1990's"
+     $text =~ s/([\p{IsN}])(['|’])([s])/$1 $2$3/g;
+   } elsif ($LANGUAGE eq "fr") {
+     # split contractions left
+     $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([^\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1$2 $3/g;
+   } elsif ($LANGUAGE eq "it") {
+     # split contractions left
+     $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([^\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+     $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1$2 $3/g;
+     $text =~ s/([^\p{IsAlpha}\p{IsN}]po) (['|’])([^\p{IsAlpha}])/$1$2 $3/g; # rule for "po'"
+   } else {
+     $text =~ s/\'/ \' /g;
+   }
+   #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+   # put the ' back in words like '90
+   $text =~ s/ ###([0-9][0-9])/ '$1/g;
+   #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+   # word token method
+   my @words = split(/\s/,$text);
+   $text = "";
+   for (my $i=0;$i<(scalar(@words));$i++) {
+     my $word = $words[$i];
+     if ( $word =~ /^(\S+)\.$/) {
+       my $pre = $1;
+       if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+         # no change
+       } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+         # no change
+       } else {
+         $word = $pre." .";
+       }
+     }
+     $text .= $word." ";
+   }
+
+   # clean up extraneous spaces
+   $text =~ s/ +/ /g;
+   $text =~ s/^ //g;
+   $text =~ s/ $//g;
+
+   # restore multi-dots
+   while($text =~ /DOTDOTMULTI/) {
+     $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+   }
+   $text =~ s/DOTMULTI/./g;
+
+   # detokenize URLs
+   $text = &detokenize_urls($text);
+
+   # ensure final line break
+   $text .= "\n" unless $text =~ /\n$/;
+   return $text;
+ }
+
+ sub detokenize_urls {
+
+   my($text) = shift(@_);
+
+   $text =~ s/(\w{3,9}) : \/ \/ /$1:\/\//g;
+   my $URL_HEAD_PATTERN = "\\w{3,9}:\\/\\/|www";
+   my $URL_BODY_PATTERN = "\\w\\d\\.\\/\\-\\#;:=\\+\\?&_";
+   my $URL_SPECIAL_PATTERN = "\\/|\\?|=|&|\\+|_|\\#|:|;|\\-";
+   while ( $text =~ /($URL_HEAD_PATTERN)[$URL_BODY_PATTERN]+ ($URL_SPECIAL_PATTERN)/ ) {
+     $text =~ s/($URL_HEAD_PATTERN)([$URL_BODY_PATTERN]+) ($URL_SPECIAL_PATTERN) {0,1}(($URL_SPECIAL_PATTERN? {0,1})+)/$1.$2.$3.&clean($4)/eg;
+   }
+
+   return $text;
+ }
+
+ sub clean {
+   my $text = shift(@_);
+   $text =~ s/ //g;   # strip the spaces the tokenizer inserted inside the URL
+   return $text;
+ }
+
+ 1;
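For reference, the contract of the two subs above: init_tokenizer takes a language code and a reference to a hash mapping nonbreaking prefixes to 1 (a following period never splits off) or 2 (it splits off unless a number follows), and tokenize returns one tokenized line. A minimal standalone driver sketch — hypothetical, for illustration only; the published entry point is core/tokenizer-cli.pl, which builds the prefix hash via core/load-prefixes.pl:

#!/usr/bin/perl -w
# Hypothetical driver for the kernel above, saved alongside tokenizer.pl.
use utf8;
require "./tokenizer.pl";

# Prefix values: 1 = keep the period attached; 2 = keep it only before numbers.
my %prefixes = ("Mr" => 1, "Dr" => 1, "No" => 2);
init_tokenizer("en", \%prefixes);

print tokenize("Mr. Smith didn't pay 5,300 euros.");
# prints: Mr. Smith didn 't pay 5,300 euros .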
data/lib/opener/tokenizers/base.rb ADDED
@@ -0,0 +1,84 @@
+ require_relative 'base/version'
+
+ module Opener
+   module Tokenizers
+     class Base
+       attr_reader :language
+
+       def initialize(opts={})
+         @language ||= opts[:language] || lang
+       end
+
+       def command(opts=[])
+         "perl -I #{lib} #{kernel} #{language} #{opts.join(' ')}"
+       end
+
+       def run(opts=ARGV)
+         `#{command(opts)}`
+       end
+
+       def set_language(language)
+         @language = language
+       end
+
+       protected
+
+       def core_dir
+         File.expand_path("../../../../core", __FILE__)
+       end
+
+       def kernel
+         File.join(core_dir,'tokenizer-cli.pl')
+       end
+
+       def lib
+         File.join(core_dir,'lib/') # Trailing / is required
+       end
+
+       def language
+         return @language.nil? ? nil : "-l #{@language}"
+       end
+
+       def lang
+         'en'
+       end
+
+     end
+
+     class EN < Base
+       def lang
+         'en'
+       end
+     end
+
+     class DE < Base
+       def lang
+         'de'
+       end
+     end
+
+     class NL < Base
+       def lang
+         'nl'
+       end
+     end
+
+     class ES < Base
+       def lang
+         'es'
+       end
+     end
+
+     class IT < Base
+       def lang
+         'it'
+       end
+     end
+
+     class FR < Base
+       def lang
+         'fr'
+       end
+     end
+   end
+ end
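Each language subclass above only overrides lang, which feeds the -l flag that Base#command passes to the Perl kernel. A minimal usage sketch (hypothetical session; with the published bin/ scripts, the text to tokenize reaches tokenizer-cli.pl on standard input when run shells out):

require 'opener/tokenizers/base'

tokenizer = Opener::Tokenizers::FR.new
puts tokenizer.command
# => perl -I .../core/lib/ .../core/tokenizer-cli.pl -l fr

tokenizer.set_language('nl')   # switch language on the same instance
output = tokenizer.run([])     # backticks: returns the kernel's stdout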
data/lib/opener/tokenizers/base/version.rb ADDED
@@ -0,0 +1,8 @@
+ module Opener
+   module Tokenizers
+     class Base
+       VERSION = "1.0.0"
+     end
+   end
+ end
+
data/opener-tokenizer-base.gemspec ADDED
@@ -0,0 +1,25 @@
+ require File.expand_path('../lib/opener/tokenizers/base/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.name        = 'opener-tokenizer-base'
+   gem.version     = Opener::Tokenizers::Base::VERSION
+   gem.authors     = ['development@olery.com']
+   gem.summary     = 'Tokenize English, Dutch, German, Italian and Spanish to KAF'
+   gem.description = gem.summary
+   gem.homepage    = 'http://opener-project.github.com/'
+   gem.has_rdoc    = "yard"
+   gem.required_ruby_version = ">= 1.9.2"
+
+   gem.files = Dir.glob([
+     'core/**/*',
+     'lib/**/*',
+     '*.gemspec',
+     'README.md'
+   ]).select { |file| File.file?(file) }
+
+   gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
+
+   gem.add_development_dependency 'cucumber'
+   gem.add_development_dependency 'rspec'
+   gem.add_development_dependency 'rake'
+ end
metadata ADDED
@@ -0,0 +1,134 @@
+ --- !ruby/object:Gem::Specification
+ name: opener-tokenizer-base
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: ruby
+ authors:
+ - development@olery.com
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-05-20 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: cucumber
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Tokenize English, Dutch, German, Italian and Spanish to KAF
+ email:
+ executables:
+ - tokenizer-en
+ - tokenizer-it
+ - tokenizer-nl
+ - tokenizer-base
+ - tokenizer-es
+ - tokenizer-fr
+ - tokenizer-de
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - bin/tokenizer-base
+ - bin/tokenizer-de
+ - bin/tokenizer-en
+ - bin/tokenizer-es
+ - bin/tokenizer-fr
+ - bin/tokenizer-it
+ - bin/tokenizer-nl
+ - core/lib/Data/OptList.pm
+ - core/lib/Params/Util.pm
+ - core/lib/Sub/Exporter.pm
+ - core/lib/Sub/Exporter/Cookbook.pod
+ - core/lib/Sub/Exporter/Tutorial.pod
+ - core/lib/Sub/Exporter/Util.pm
+ - core/lib/Sub/Install.pm
+ - core/lib/Time/Stamp.pm
+ - core/load-prefixes.pl
+ - core/nonbreaking_prefixes/abbreviation_list.kaf
+ - core/nonbreaking_prefixes/abbreviation_list.txt
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ca
+ - core/nonbreaking_prefixes/nonbreaking_prefix.de
+ - core/nonbreaking_prefixes/nonbreaking_prefix.el
+ - core/nonbreaking_prefixes/nonbreaking_prefix.en
+ - core/nonbreaking_prefixes/nonbreaking_prefix.es
+ - core/nonbreaking_prefixes/nonbreaking_prefix.fr
+ - core/nonbreaking_prefixes/nonbreaking_prefix.is
+ - core/nonbreaking_prefixes/nonbreaking_prefix.it
+ - core/nonbreaking_prefixes/nonbreaking_prefix.nl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.pl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.pt
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ro
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ru
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sk
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sv
+ - core/split-sentences.pl
+ - core/text-fixer.pl
+ - core/tokenizer-cli.pl
+ - core/tokenizer.pl
+ - lib/opener/tokenizers/base.rb
+ - lib/opener/tokenizers/base/version.rb
+ - opener-tokenizer-base.gemspec
+ homepage: http://opener-project.github.com/
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Tokenize English, Dutch, German, Italian and Spanish to KAF
+ test_files: []
+ has_rdoc: yard