opener-tokenizer-base 1.0.0

Files changed (44)
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
data/core/tokenizer.pl ADDED
@@ -0,0 +1,145 @@
+ #!/usr/bin/perl -w
+
+ # This script tokenizes an input sentence.
+ # It implements the Moses tokenizer and has been modified
+ # for OpeNER by Aitor García and Andoni Azpeitia.
+
+ use FindBin;
+ use utf8;
+
+ my %NONBREAKING_PREFIX = ();
+ my $LANGUAGE;
+ my $SUBSTITUTE = "####";
+
+ sub init_tokenizer {
+     $LANGUAGE           = shift(@_);
+     %NONBREAKING_PREFIX = %{ shift(@_) };
+ }
+
+ sub tokenize {
+
+     my ($text) = shift(@_);
+     chomp($text);
+     #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+     # tokenize dashes at the beginning of a line
+     $text =~ s/^\-([^ ])/\- $1/g;
+
+     # turn ’ into '
+     $text =~ s/’/\'/g;
+     #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+     $text = " $text ";
+     # separate out all "other" special characters
+     $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-\’])/ $1 /g;
+     #$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+     # multi-dots stay together
+     $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+     while($text =~ /DOTMULTI\./) {
+         $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+         $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+     }
+     # separate out "," except if within numbers (5,300)
+     $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+     # separate , pre and post number
+     $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+     $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+     # turn ` into '
+     $text =~ s/\`/\'$SUBSTITUTE/g;
+
+     # turn '' into "
+     $text =~ s/\'\'/ \"$SUBSTITUTE /g;
+     #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+     # tokenize words like '05-'06
+     $text =~ s/(['|’])([0-9][0-9])\-(['|’])([0-9][0-9])/$1$2 - $3$4/g;
+     # replace ' with ### so that words like '90 are not tokenized
+     $text =~ s/ ['|’]([0-9][0-9])/ ###$1/g;
+     #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+     if ($LANGUAGE eq "en") {
+         # split contractions right
+         $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([^\p{IsAlpha}\p{IsN}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2$3/g;
+         # special case for "1990's"
+         $text =~ s/([\p{IsN}])(['|’])([s])/$1 $2$3/g;
+     } elsif ($LANGUAGE eq "fr") {
+         # split contractions left
+         $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([^\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1$2 $3/g;
+     } elsif ($LANGUAGE eq "it") {
+         # split contractions left
+         $text =~ s/([^\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([^\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([^\p{IsAlpha}])/$1 $2 $3/g;
+         $text =~ s/([\p{IsAlpha}])(['|’])([\p{IsAlpha}])/$1$2 $3/g;
+         $text =~ s/([^\p{IsAlpha}\p{IsN}]po) (['|’])([^\p{IsAlpha}])/$1$2 $3/g; # rule for "po'"
+     } else {
+         $text =~ s/\'/ \' /g;
+     }
+     #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+     # replace the ### with ' to tokenize words like '90
+     $text =~ s/ ###([0-9][0-9])/ '$1/g;
+     #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+     # word token method
+     my @words = split(/\s/,$text);
+     $text = "";
+     for (my $i=0;$i<(scalar(@words));$i++) {
+         my $word = $words[$i];
+         if ( $word =~ /^(\S+)\.$/) {
+             my $pre = $1;
+             if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+                 # no change
+             } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+                 # no change
+             } else {
+                 $word = $pre." .";
+             }
+         }
+         $text .= $word." ";
+     }
+
+     # clean up extraneous spaces
+     $text =~ s/ +/ /g;
+     $text =~ s/^ //g;
+     $text =~ s/ $//g;
+
+     # restore multi-dots
+     while($text =~ /DOTDOTMULTI/) {
+         $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+     }
+     $text =~ s/DOTMULTI/./g;
+
+     # detokenize URLs
+     $text = &detokenize_urls($text);
+
+     # ensure final line break
+     $text .= "\n" unless $text =~ /\n$/;
+     return $text;
+ }
+
+ sub detokenize_urls {
+
+     my ($text) = shift(@_);
+
+     $text =~ s/(\w{3,9}) : \/ \/ /$1:\/\//g;
+     my $URL_HEAD_PATTERN    = "\\w{3,9}:\\/\\/|www";
+     my $URL_BODY_PATTERN    = "\\w\\d\\.\\/\\-\\#;:=\\+\\?&_";
+     my $URL_SPECIAL_PATTERN = "\\/|\\?|=|&|\\+|_|\\#|:|;|\\-";
+     while ( $text =~ /($URL_HEAD_PATTERN)[$URL_BODY_PATTERN]+ ($URL_SPECIAL_PATTERN)/ ) {
+         $text =~ s/($URL_HEAD_PATTERN)([$URL_BODY_PATTERN]+) ($URL_SPECIAL_PATTERN) {0,1}(($URL_SPECIAL_PATTERN? {0,1})+)/$1.$2.$3.&clean($4)/eg;
+     }
+
+     return $text;
+ }
+
+ sub clean {
+     my $text = shift(@_);
+     $text =~ s/ //g;
+     return $text;
+ }
+
+ 1;
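The routine above is not meant to be called directly; inside the gem it is loaded by data/core/tokenizer-cli.pl and reached through the Ruby wrapper added further below. As a hypothetical sketch only, assuming perl is on the PATH and that tokenizer-cli.pl reads plain text on standard input (neither is shown in this diff), driving the Perl kernel from Ruby could look like this:

    # Hypothetical sketch, not part of the gem: shell out to the Perl CLI in the
    # same way the Ruby wrapper below builds its command. The '-l' language flag
    # is taken from that wrapper; reading stdin is an assumption.
    core = File.expand_path('data/core', __dir__)  # assumed checkout layout
    cmd  = "perl -I #{core}/lib/ #{core}/tokenizer-cli.pl -l en"

    IO.popen(cmd, 'r+') do |io|
      io.puts "Dr. Smith isn't here; he moved to the U.S. in '99."
      io.close_write
      puts io.read  # tokenized output
    end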
data/lib/opener/tokenizers/base.rb ADDED
@@ -0,0 +1,84 @@
+ require_relative 'base/version'
+
+ module Opener
+   module Tokenizers
+     class Base
+       attr_reader :language
+
+       def initialize(opts={})
+         @language ||= opts[:language] || lang
+       end
+
+       def command(opts=[])
+         "perl -I #{lib} #{kernel} #{language} #{opts.join(' ')}"
+       end
+
+       def run(opts=ARGV)
+         `#{command(opts)}`
+       end
+
+       def set_language(language)
+         @language = language
+       end
+
+       protected
+
+       def core_dir
+         File.expand_path("../../../../core", __FILE__)
+       end
+
+       def kernel
+         File.join(core_dir,'tokenizer-cli.pl')
+       end
+
+       def lib
+         File.join(core_dir,'lib/') # Trailing / is required
+       end
+
+       def language
+         return @language.nil? ? nil : "-l #{@language}"
+       end
+
+       def lang
+         'en'
+       end
+
+     end
+
+     class EN < Base
+       def lang
+         'en'
+       end
+     end
+
+     class DE < Base
+       def lang
+         'de'
+       end
+     end
+
+     class NL < Base
+       def lang
+         'nl'
+       end
+     end
+
+     class ES < Base
+       def lang
+         'es'
+       end
+     end
+
+     class IT < Base
+       def lang
+         'it'
+       end
+     end
+
+     class FR < Base
+       def lang
+         'fr'
+       end
+     end
+   end
+ end
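For orientation, a minimal usage sketch of the wrapper classes above (not taken from the gem's documentation): it assumes the gem is installed, perl is available, and the underlying tokenizer-cli.pl reads raw text from standard input.

    require 'opener/tokenizers/base'

    tokenizer = Opener::Tokenizers::EN.new  # or DE, NL, ES, IT, FR
    puts tokenizer.command                  # the perl command line that run() will execute
    # run() uses backticks, so the Perl process inherits this process's stdin:
    #   echo "Cats sleep, don't they?" | ruby this_script.rb
    puts tokenizer.run([])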
data/lib/opener/tokenizers/base/version.rb ADDED
@@ -0,0 +1,8 @@
+ module Opener
+   module Tokenizers
+     class Base
+       VERSION = "1.0.0"
+     end
+   end
+ end
+
data/opener-tokenizer-base.gemspec ADDED
@@ -0,0 +1,25 @@
+ require File.expand_path('../lib/opener/tokenizers/base/version', __FILE__)
+
+ Gem::Specification.new do |gem|
+   gem.name        = 'opener-tokenizer-base'
+   gem.version     = Opener::Tokenizers::Base::VERSION
+   gem.authors     = ['development@olery.com']
+   gem.summary     = 'Tokenize English, Dutch, German, Italian and Spanish to KAF'
+   gem.description = gem.summary
+   gem.homepage    = 'http://opener-project.github.com/'
+   gem.has_rdoc    = "yard"
+   gem.required_ruby_version = ">= 1.9.2"
+
+   gem.files = Dir.glob([
+     'core/**/*',
+     'lib/**/*',
+     '*.gemspec',
+     'README.md'
+   ]).select { |file| File.file?(file) }
+
+   gem.executables = Dir.glob('bin/*').map { |file| File.basename(file) }
+
+   gem.add_development_dependency 'cucumber'
+   gem.add_development_dependency 'rspec'
+   gem.add_development_dependency 'rake'
+ end
metadata ADDED
@@ -0,0 +1,134 @@
+ --- !ruby/object:Gem::Specification
+ name: opener-tokenizer-base
+ version: !ruby/object:Gem::Version
+   version: 1.0.0
+ platform: ruby
+ authors:
+ - development@olery.com
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-05-20 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: cucumber
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Tokenize English, Dutch, German, Italian and Spanish to KAF
+ email:
+ executables:
+ - tokenizer-en
+ - tokenizer-it
+ - tokenizer-nl
+ - tokenizer-base
+ - tokenizer-es
+ - tokenizer-fr
+ - tokenizer-de
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.md
+ - bin/tokenizer-base
+ - bin/tokenizer-de
+ - bin/tokenizer-en
+ - bin/tokenizer-es
+ - bin/tokenizer-fr
+ - bin/tokenizer-it
+ - bin/tokenizer-nl
+ - core/lib/Data/OptList.pm
+ - core/lib/Params/Util.pm
+ - core/lib/Sub/Exporter.pm
+ - core/lib/Sub/Exporter/Cookbook.pod
+ - core/lib/Sub/Exporter/Tutorial.pod
+ - core/lib/Sub/Exporter/Util.pm
+ - core/lib/Sub/Install.pm
+ - core/lib/Time/Stamp.pm
+ - core/load-prefixes.pl
+ - core/nonbreaking_prefixes/abbreviation_list.kaf
+ - core/nonbreaking_prefixes/abbreviation_list.txt
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ca
+ - core/nonbreaking_prefixes/nonbreaking_prefix.de
+ - core/nonbreaking_prefixes/nonbreaking_prefix.el
+ - core/nonbreaking_prefixes/nonbreaking_prefix.en
+ - core/nonbreaking_prefixes/nonbreaking_prefix.es
+ - core/nonbreaking_prefixes/nonbreaking_prefix.fr
+ - core/nonbreaking_prefixes/nonbreaking_prefix.is
+ - core/nonbreaking_prefixes/nonbreaking_prefix.it
+ - core/nonbreaking_prefixes/nonbreaking_prefix.nl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.pl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.pt
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ro
+ - core/nonbreaking_prefixes/nonbreaking_prefix.ru
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sk
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sl
+ - core/nonbreaking_prefixes/nonbreaking_prefix.sv
+ - core/split-sentences.pl
+ - core/text-fixer.pl
+ - core/tokenizer-cli.pl
+ - core/tokenizer.pl
+ - lib/opener/tokenizers/base.rb
+ - lib/opener/tokenizers/base/version.rb
+ - opener-tokenizer-base.gemspec
+ homepage: http://opener-project.github.com/
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Tokenize English, Dutch, German, Italian and Spanish to KAF
+ test_files: []
+ has_rdoc: yard