opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Based on Preprocessor written by Philipp Koehn
4
+ # Changed by aazpeitia (aazpeitia@vicomtech.org)
5
+
6
+ use strict;
7
+
8
+ my %NONBREAKING_PREFIX = ();
9
+ my $LANGUAGE;
10
+
11
# Remembers the language code and a private copy of the non-breaking
# prefix table for later calls to the sentence splitter.
#   1st argument: language code
#   2nd argument: reference to a hash mapping a prefix to its class
#                 (preprocess() tests the classes 1 and 2)
sub init_sentence_splitter {
    my ($language, $prefixes) = @_;
    $LANGUAGE = $language;
    %NONBREAKING_PREFIX = %{$prefixes};
}
15
+
16
# Splits one line of text into sentences.
#   1st argument: the line to split (a trailing newline is removed)
#   Returns: the list of sentences; the empty list for a markup-only line
#            (<...>) or a line holding nothing but whitespace.
sub split_sentences {

    my $input_text = shift(@_);
    chomp($input_text);

    my $text = "";
    # Test the argument itself. Matching against the global $_ made the
    # result depend on whatever line the caller happened to be reading.
    unless ($input_text =~ /^<.+>$/ || $input_text =~ /^\s*$/) {
        # the trailing space mirrors how paragraph text was accumulated
        $text = &do_it_for($input_text . " ", $input_text);
    }
    return split("\n", $text);
}
36
+
37
# Runs preprocess() on a paragraph, unless the paragraph is empty or
# otherwise false, in which case it is handed back untouched.
#   1st argument: paragraph text
#   2nd argument: markup of the current line (accepted but not used)
#   Returns: the processed paragraph
sub do_it_for {
    my ($paragraph, $markup) = @_;
    return $paragraph unless $paragraph;
    my $processed = &preprocess($paragraph);
    return $processed;
}
43
+
44
# Inserts sentence breaks ("\n") into one paragraph of text.
#   1st argument: paragraph text
#   Returns: the paragraph with one sentence per line, always ending in "\n"
#
# Breaks are added after ?, ! or . when what follows looks like the start
# of a sentence.  %NONBREAKING_PREFIX suppresses breaks after known
# prefixes: class 1 never breaks, class 2 does not break before a number.
sub preprocess {
    #this is one paragraph
    my($text) = @_;

    # Punctuation that may open a sentence.  The inverted question and
    # exclamation marks are written as code points so that the class does
    # not depend on how this source file is decoded; as raw UTF-8 bytes
    # they also put U+00C2 into the class.
    my $opener = q{[\'\"\(\[\x{BF}\x{A1}\p{IsPi}]};

    # collapse runs of spaces and drop spaces next to line breaks and at
    # both ends of the text
    my $tidy = sub {
        my ($chunk) = @_;
        $chunk =~ s/ +/ /g;
        $chunk =~ s/\n /\n/g;
        $chunk =~ s/ \n/\n/g;
        $chunk =~ s/^ //g;
        $chunk =~ s/ $//g;
        return $chunk;
    };

    $text = $tidy->($text);

    #####add sentence breaks as needed#####
    #non-period end of sentence markers (?!) followed by sentence starters.
    $text =~ s/([?!]) +(${opener}*[\w])/$1\n$2/g;

    #multi-dots followed by sentence starters
    $text =~ s/(\.[\.]+) +(${opener}*[\p{IsUpper}])/$1\n$2/g;

    # sentence ends with punctuation inside a quote or parenthetical and is
    # followed by a possible sentence starter punctuation and upper case
    $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +(${opener}*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # sentence ends with punctuation and is followed by a sentence starter
    # punctuation and upper case
    $text =~ s/([?!\.]) +(${opener}+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # special punctuation cases are covered. Check all remaining periods.
    # The last word is never examined: every check needs a look-ahead word.
    my @words = split(/ /,$text);
    for my $i (0 .. $#words - 1) {
        next unless $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/;
        my ($prefix, $closing_punct) = ($1, $2);

        # class of a known prefix; 0 when unknown or followed by closing punctuation
        my $known = ($prefix && !$closing_punct) ? ($NONBREAKING_PREFIX{$prefix} || 0) : 0;

        #not breaking - known prefix
        next if $known == 1;
        #not breaking - upper case acronym
        next if $words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/;
        #the next word must start with optional quotes, then upper case or a number
        next unless $words[$i+1] =~ /^([ ]*${opener}*[ ]*[\p{IsUpper}0-9])/;
        #not breaking - numeric non-breaker followed by a number
        next if $known == 2 && $words[$i+1] =~ /^[0-9]+/;

        $words[$i] .= "\n";
    }
    # an empty word list simply yields an empty string here
    $text = join(" ", @words);

    $text = $tidy->($text);

    #add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
113
+
114
+ 1;
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # reads input text and fixes some mistakes
4
+ # developed by Andoni Azpeitia
5
+
6
+ use utf8;
7
+
8
+ my %NONBREAKING_PREFIX = ();
9
+ my $LANGUAGE;
10
+
11
+ my $START_QUOTES_REGEX = "“|‘|«|‹";
12
+ my $END_QUOTES_REGEX = "”|’|»|›";
13
+
14
# Remembers the language code and a private copy of the non-breaking
# prefix table for later calls to fix_text().
#   1st argument: language code
#   2nd argument: reference to a hash mapping a prefix to its class
#                 (fix_text() tests the classes 1 and 2)
sub init_text_fixer {
    my ($language, $prefixes) = @_;
    $LANGUAGE = $language;
    %NONBREAKING_PREFIX = %{$prefixes};
}
18
+
19
# Repairs common punctuation mistakes in one line of text, word by word.
#   1st argument: the line (a trailing newline is removed)
#   Returns: the repaired text; every word is followed by one space, so
#            the result ends with a space
#
# The quote alternations are wrapped in (?:...) before "*" is applied.
# Interpolated bare, "a|b|c*" lets the "*" bind to the last quote only and
# lets the anchors bind to the outer alternatives only, so the look-ahead
# accepted any word that merely contained an upper-case letter.
sub fix_text {

    my($text) = shift(@_);

    chomp($text);

    #fix encoding
    $text = &fix_encoding($text);

    #word token method
    my @words = split(/\s/,$text);
    $text = "";
    for my $i (0 .. $#words) {
        my $word = $words[$i];

        #Kumi Naidoo said: “bla bla bla.”Bla bla => Kumi Naidoo said: “bla bla bla”. Bla bla
        if ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)((?:$START_QUOTES_REGEX)*\p{IsUpper}\S*)$/ ) {
            $word = $1.$2.". ".$3;
        }
        #to a "breach of trust." A German => to a "breach of trust". A German
        elsif ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)$/ ) {
            my ($pre, $quote) = ($1, $2);
            # move the period out of the quote at the end of the text, or
            # when the next word starts a sentence
            if ( $i == $#words || $words[$i+1] =~ /^(?:$START_QUOTES_REGEX)*\p{IsUpper}\S*$/ ) {
                $word = $pre.$quote.".";
            }
        }
        #OpeNER is amazing.OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ( $word =~ /^(\S+)\.(\S+)$/ ) {
            my ($pre, $post) = ($1, $2);
            my $known = $NONBREAKING_PREFIX{$pre} || 0;
            my $keep = ($pre =~ /\./ && $pre =~ /\p{IsAlpha}/)    # dotted abbreviation
                    || $known == 1                               # known prefix
                    || $post =~ /^[\p{IsLower}]/                 # lower case follows
                    || ($known == 2 && $post =~ /^[0-9]+/);      # numeric non-breaker
            $word = $pre.". ".$post unless $keep;
        }
        #OpeNER is amazing .OpeNER is cool. => the period is split from the word
        elsif ( $word =~ /^\.(\p{IsUpper}\S+)$/ ) {
            my $post = $1;
            if ( $i>0 && $words[$i-1] =~ /^(\S+)$/ ) {
                $word = ". ".$post;
            }
        }
        $text .= $word." ";
    }
    return $text;
}
80
+
81
+ sub fix_encoding {
82
+
83
+ my $text = shift(@_);
84
+
85
+ $text =~ s/’/'/g;
86
+ $text =~ s/À/À/g;
87
+ $text =~ s/“/“/g;
88
+ $text =~ s///g;
89
+ $text =~ s//\"/g;
90
+ ############################################
91
+ ############################################
92
+ $text =~ s/…/…/g; # elipsis
93
+ $text =~ s/…/…/g; # elipsis
94
+ $text =~ s/–/–/g; # long hyphen
95
+ $text =~ s/’/’/g; #curly apostrophe
96
+ $text =~ s/“/“/g; # curly open quote
97
+ $text =~ s/â€/”/g; # curly close quote
98
+ $text =~ s/»/»/g;
99
+ $text =~ s/«/«/g;
100
+ ############################################
101
+ $text =~ s/á/á/g;
102
+ $text =~ s/é/é/g;
103
+ $text =~ s/Ã\*/í/g;
104
+ $text =~ s/ó/ó/g;
105
+ $text =~ s/ú/ú/g;
106
+
107
+ $text =~ s/Á/Á/g;
108
+ $text =~ s/É/É/g;
109
+ $text =~ s/Í/Í/g;
110
+ $text =~ s/Ó/Ó/g;
111
+ $text =~ s/Ú/Ú/g;
112
+ ############################################
113
+ $text =~ s/ñ/ñ/g;
114
+ $text =~ s/ç/ç/g;
115
+ $text =~ s/Å“/œ/g;
116
+
117
+ $text =~ s/Ñ/Ñ/g;
118
+ $text =~ s/Ç/Ç/g;
119
+ $text =~ s/Å’/Œ/g;
120
+ ############################################
121
+ $text =~ s/©/©/g;
122
+ $text =~ s/®/®/g;
123
+ $text =~ s/â„¢/™/g;
124
+ $text =~ s/Ø/Ø/g;
125
+ $text =~ s/ª/ª/g;
126
+ ############################################
127
+ $text =~ s/ä/ä/g;
128
+ $text =~ s/ë/ë/g;
129
+ $text =~ s/ï/ï/g;
130
+ $text =~ s/ö/ö/g;
131
+ $text =~ s/ü/ü/g;
132
+
133
+ $text =~ s/Ä/Ä/g;
134
+ $text =~ s/Ë/Ë/g;
135
+ $text =~ s/Ï /Ï/g;
136
+ $text =~ s/Ö /Ö/g;
137
+ $text =~ s/Ü/Ü/g;
138
+ ############################################
139
+ $text =~ s/à/à/g;
140
+ $text =~ s/è/è/g;
141
+ $text =~ s/ì/ì/g;
142
+ $text =~ s/ò/ò/g;
143
+ $text =~ s/ù/ù/g;
144
+
145
+ $text =~ s/À/À/g;
146
+ $text =~ s/È/È/g;
147
+ $text =~ s/ÃŒ/Ì/g;
148
+ $text =~ s/Ã’/Ò/g;
149
+ $text =~ s/Ù/Ù/g;
150
+ ############################################
151
+ $text =~ s/â/â/g;
152
+ $text =~ s/ê/ê/g;
153
+ $text =~ s/î/î/g;
154
+ $text =~ s/ô/ô/g;
155
+ $text =~ s/û/û/g;
156
+
157
+ $text =~ s/Â/Â/g;
158
+ $text =~ s/Ê/Ê/g;
159
+ $text =~ s/ÃŽ/Î/g;
160
+ $text =~ s/Ô/Ô/g;
161
+ $text =~ s/Û/Û/g;
162
+ ############################################
163
+ $text =~ s/É/E/g;
164
+
165
+
166
+ return $text;
167
+ }
168
+
169
+ 1;
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Sample Tokenizer
4
+ # written by Josh Schroeder, based on code by Philipp Koehn
5
+ # changed by Haritz Arzelus (#2012/11/19) Aitor García and Andoni Azpeitia
6
+
7
+ use FindBin;
8
+
9
+ use lib "$FindBin::Bin/lib";
10
+
11
+ use Encode::Guess;
12
+ use Time::Stamp;
13
+
14
+ require "$FindBin::Bin"."/text-fixer.pl";
15
+ require "$FindBin::Bin"."/split-sentences.pl";
16
+ require "$FindBin::Bin"."/tokenizer.pl";
17
+ require "$FindBin::Bin"."/load-prefixes.pl";
18
+
19
+ no warnings;
20
+ use encoding 'utf8';
21
+
22
+ binmode(STDIN, ":utf8");
23
+ binmode(STDOUT, ":utf8");
24
+ binmode(STDERR, ":utf8");
25
+
26
+ #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
27
+ #use FindBin qw($Bin);
28
+ #use strict;
29
+ #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
30
+ #use Time::HiRes;
31
+
32
+ #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
33
+ #my $mydir = "$Bin/nonbreaking_prefixes";
34
+ #changed by me (aitor) to point to the directory of the script, instead of current working directory
35
+ #my $mydir = "$FindBin::Bin"."/nonbreaking_prefixes";
36
+ #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
37
+
38
+ #my $start = [ Time::HiRes::gettimeofday( ) ];
39
+
40
+ #while (@ARGV) {
41
+ # $_ = shift;
42
+ # /^-l$/ && ($language = shift, next);
43
+ # /^-q$/ && ($QUIET = 1, next);
44
+ # /^-h$/ && ($HELP = 1, next);
45
+ #}
46
+
47
+ #if (!$QUIET) {
48
+ # print STDERR "Tokenizer v3\n";
49
+ # print STDERR "Language: $language\n";
50
+ #}
51
+
52
# Command-line state; checkArguments() fills in $FILE, $LANGUAGE,
# $NOTIMESTAMP and $HELP.
my %NONBREAKING_PREFIX = ();
my $SENT_VERSION = "0.0.1";
my $TOK_VERSION = "1.0.1";
my $FILE = "";
my $LANGUAGE;
my $NOTIMESTAMP = 0;
my $HELP = 0;

# marker appended to quote tokens that were substituted during tokenization
my $SUBSTITUTE = "####";

# Invalid arguments: show the help and fail.
unless (checkArguments(\@ARGV) == 1) {
    displayHelp();
    exit -1;
}

# Valid arguments that ask for help: show it and stop successfully.
if ($HELP == 1) {
    displayHelp();
    exit 0;
}
73
+
74
# load nonbreaking prefixes and init text fixer, sentence splitter and tokenizer
%NONBREAKING_PREFIX = %{ &load_prefixes($LANGUAGE) };

&init_text_fixer($LANGUAGE, \%NONBREAKING_PREFIX);
&init_sentence_splitter($LANGUAGE, \%NONBREAKING_PREFIX);
&init_tokenizer($LANGUAGE, \%NONBREAKING_PREFIX);

# get timestamp; -t keeps the fixed placeholder value
my $timestamp = "0000-00-00T00:00:00Z";
if ($NOTIMESTAMP == 0) {
    $timestamp = timestamp();
}

# print kaf header
if ($FILE ne "") {
    # Split the name at its last dot.  Without a dot rindex() returns -1,
    # which used to cut the last character off the file name; such a name
    # is now kept whole with an empty file type.
    my $dot = rindex($FILE, ".");
    my ($filename, $filetype) = $dot < 0
        ? ($FILE, "")
        : (substr($FILE, 0, $dot), uc(substr($FILE, $dot + 1)));
    print_kafheader($filename, $filetype, $timestamp, $LANGUAGE);
}
else {
    print_kafheader_nofile($timestamp);
}
print " <text>\n";
100
+
101
# process text: read STDIN line by line and print one <wf> element per token
my $sent = 1;        # sentence number, counted over the whole input
my $para = 1;        # paragraph number, one per processed line
my $counter = 0;     # word form id
my $charcount = 0;   # length of the lines processed so far
while(<STDIN>) {

    if (/^<.+>$/ || /^\s*$/) {
        #don't try to tokenize XML/HTML tag lines
        # NOTE(review): such lines advance neither $charcount nor $para,
        # so later offsets do not include them - confirm this is intended.
        chomp($_);
        next;
    }

    #fix input text
    my $text = &fix_text($_);

    #split sentences
    my @sentences = &split_sentences($text);
    my $index = 0;
    my $last_index = 0;   # position in $_ just after the previous token

    foreach my $j (0 .. $#sentences) {
        my $tok = tokenize($sentences[$j]);
        #tokenize some especial characters
        $tok =~ s/([“|”|…|—|–|«|»])([^ ])/$1 $2/g;
        $tok =~ s/([^ ])([“|”|…|—|–|«|»])/$1 $2/g;
        #detokenize tokens with @
        $tok =~ s/ @ /@/g;
        #detokenize some tokens with '
        $tok =~ s/([DLNO]) '/$1'/g;
        $tok =~ s/o( )?'( )?clock/o'clock/g;
        $tok =~ s/ ' ([0-9][0-9]s)/ '$1/g;
        #detokenize some time formats
        $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9]*) "/$1'$2"/g;
        $tok =~ s/([0-9][0-9]*) : ([0-9][0-9])/$1:$2/g;
        #detokenize some height formats
        $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9])/$1'$2/g;
        #tokenize two dashes
        $tok =~ s/\-\-/ \-\-/g;
        #correct ºC tokenization
        $tok =~ s/([0-9])( )?º( )?C/$1 ºC/g;
        $tok =~ s/ +/ /g;
        chomp($tok);

        my @tokens = split(/ /, $tok);

        foreach my $i (0 .. $#tokens) {
            my $token = $tokens[$i];

            $index = index($_, $token, $last_index);

            #if token was substituted at tokenization, be careful
            if ( $token eq "\"".$SUBSTITUTE ) {
                $index = index($_, "'", $last_index);
                $token = "\"";
            }
            elsif ( $token eq "\'".$SUBSTITUTE ) {
                $index = index($_, "`", $last_index);
                $token = "\'";
            }

            my $offset = $charcount + $index;

            # Position of the token that follows, searched in the raw line;
            # -1 when there is none or it cannot be found.
            my $next_index = -1;
            my $have_next = 0;

            #if input text has been preprocesed and tokens has been moved ( 'hello.' => 'hello'.),
            # offset of the "." char is at the left of "'" char not at the right
            if ( $index==-1 ) {
                $index = index($_, $token, $last_index-2);
                $offset = $charcount + $index;
            }
            # next token is at the same sentence
            elsif ( $i < $#tokens ) {
                $next_index = index($_, $tokens[$i+1], length($token) + $last_index-1);
                $have_next = 1;
            }
            # next token is at next sentence of this line; the test uses the
            # per-line index $j, not the input-wide sentence number $sent
            elsif ( $j < $#sentences ) {
                my $next_sentence = &tokenize($sentences[$j+1]);
                my @next_tokens = split(/ /, $next_sentence);
                if (defined($next_tokens[0])) {
                    $next_index = index($_, $next_tokens[0], length($token) + $last_index-1);
                    $have_next = 1;
                }
            }

            #make sure that found offset is not an offset of the same char at other position:
            # a token found after the token that follows it is searched again.
            # The raw indexes are compared, so "not found" (-1) stays recognisable.
            if ( $have_next && $next_index > -1 && $index > $next_index+1 ) {
                $index = index($_, $token, $last_index-2);
                $offset = $charcount + $index;
            }

            my $token_length = length($token);
            &print_line(++$counter, $sent, $para, $offset, $token_length, $token);

            $last_index = $index + $token_length;
        }

        $sent++;
    }#foreach sentence

    # an empty line counts as one character
    $charcount += length($_) || 1;
    $para++;
}#while(<STDIN>)
print " </text>\n";
print "</KAF>\n";
226
+
227
+
228
# Prints one word form (<wf>) element in KAF format to the selected
# output handle.
#   Arguments, in order: word id, sentence number, paragraph number,
#   character offset, token length, token text.
sub print_line {

    my ($wid, $sent, $para, $offset, $length, $token) = @_;

    # "]]>" inside the token would close the CDATA section early and break
    # the document; split it over two adjacent CDATA sections instead.
    $token =~ s/\]\]>/]]]]><![CDATA[>/g;

    print " <wf wid=\"w".$wid."\" sent=\"".$sent."\" para=\"".$para."\" offset=\"".$offset."\" length=\"". $length."\"><![CDATA[".$token."]]></wf>\n";
}
240
+
241
# Prints the KAF XML header, including a <fileDesc> for the input file.
#   Arguments, in order: file name, file type, timestamp, language code.
# The file name and type come from the command line, so they are escaped
# before being written into XML attributes.
sub print_kafheader {
    my ($filename, $filetype, $timestamp, $LANGUAGE) = @_;

    # the loop variable aliases the two lexicals, so they are escaped in place
    for my $value ($filename, $filetype) {
        $value =~ s/&/&amp;/g;    # must come first
        $value =~ s/</&lt;/g;
        $value =~ s/>/&gt;/g;
        $value =~ s/"/&quot;/g;
    }

    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc filename=\"".$filename."\" filetype=\"".$filetype."\" />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}
257
# Prints the KAF XML header with an empty <fileDesc />, for input that has
# no file name.  Language and versions are taken from the script's settings.
#   1st argument: timestamp written on both linguistic processors
sub print_kafheader_nofile {
    my ($timestamp) = @_;

    my $header = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
               . "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n"
               . " <kafHeader>\n"
               . " <fileDesc />\n"
               . " <linguisticProcessors layer=\"text\">\n"
               . " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n"
               . " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n"
               . " </linguisticProcessors>\n"
               . " </kafHeader>\n";
    print $header;
}
270
+
271
# Parses the command line into $LANGUAGE, $FILE, $NOTIMESTAMP and $HELP.
#   1st argument: reference to the argument list
#   Returns: 1 when the arguments are usable, 0 otherwise (an error is
#            printed to STDERR)
#
# Options are matched case-insensitively:
#   -l / --language <code>   required unless --help is given
#   -f / --filename <name>   optional
#   -t                       optional
#   --help
sub checkArguments {
    my $argref = shift(@_);
    my @arg = @ { $argref };
    my $correct = 1;

    if (scalar(@arg) == 0) {
        print STDERR "Error: language don't specified\n";
        return 0;
    }

    # words that can never be taken as the value of -l or -f
    my %is_option = (
        "-t" => 1, "-f" => 1, "-l" => 1,
        "--language" => 1, "--filename" => 1,
    );

    for (my $i = 0; $i < scalar(@arg); $i++) {
        my $option = lc($arg[$i]);
        my $has_value = scalar(@arg) > $i+1;

        if ($option eq "-l" || $option eq "--language") {
            if (!$has_value) {
                $correct = 0;
                print STDERR "Error: language don't specified\n";
            }
            elsif (!$is_option{lc($arg[$i+1])} && checkLanguage($arg[$i+1]) == 1) {
                $LANGUAGE = $arg[$i+1];
            }
            else {
                $correct = 0;
                print STDERR "Error: language \"".$arg[$i+1]."\" not supported\n";
            }
        }
        elsif ($option eq "-f" || $option eq "--filename") {
            if ($has_value && !$is_option{lc($arg[$i+1])}) {
                $FILE = $arg[$i+1];
            }
            else {
                $correct = 0;
                print STDERR "Error: file's name empty\n";
            }
        }
        elsif ($option eq "-t") {
            $NOTIMESTAMP = 1;
        }
        elsif ($option eq "--help") {
            $HELP = 1;
        }
    }

    # a request for help needs no language
    return $correct if $HELP;

    if (defined($LANGUAGE) && $LANGUAGE ne "") {
        return $correct;
    }
    print STDERR "Error: language don't specified\n";
    return 0;
}
321
+
322
# Tells whether a language code is supported by the tokenizer.
#   1st argument: language code (matched exactly, case-sensitive)
#   Returns: 1 when supported, -1 otherwise
sub checkLanguage {
    my ($language) = @_;
    my %supported = (
        "en" => 1, "es" => 1, "fr" => 1,
        "it" => 1, "de" => 1, "nl" => 1,
    );
    return $supported{$language} ? 1 : -1;
}
332
+
333
# Prints the usage text to STDERR.
sub displayHelp {
    my @usage = (
        "\nThis aplication reads a text from standard input in order to tokenize.\n",
        "Application arguments:\n",
        "-l, --language input text's language.\n",
        "-f, --filename (optional) file's name.\n",
        "-t, (optional) o use static timestamp at KAF header.\n",
        "--help, outputs aplication help.\n",
    );
    print STDERR join("", @usage);
}
341
+
342
# Returns the value of Time::Stamp::gmstamp(), called in scalar context.
sub timestamp {
    return scalar(Time::Stamp::gmstamp());
}
346
+
347
# Guesses the encoding of a file from its first 500 bytes.
#   1st argument: path of the file
#   Returns: the name of the encoding found by guess_encoding(), or "utf8"
#            when the file cannot be opened, is empty, or no single
#            encoding could be determined
sub detect_encoding {
    my $file = shift(@_);
    my $enc;

    # Three-argument open on a lexical handle: the path is never read as
    # an open mode or a command, and a failed open is handled explicitly.
    if (open(my $handle, '<', $file)) {
        binmode($handle);
        if (read($handle, my $filestart, 500)) {
            $enc = guess_encoding($filestart);
        }
        close($handle);
    }

    # guess_encoding() gives an object on success and a plain message otherwise
    return ref($enc) ? $enc->name : "utf8";
}
363
+