opener-tokenizer-base 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Based on Preprocessor written by Philipp Koehn
4
+ # Changed by aazpeitia (aazpeitia@vicomtech.org)
5
+
6
+ use strict;
7
+
8
+ my %NONBREAKING_PREFIX = ();
9
+ my $LANGUAGE;
10
+
11
# Configures the sentence splitter.
#   first argument  - language code, stored in the file-level $LANGUAGE
#   second argument - reference to a hash of nonbreaking prefixes; its
#                     contents are copied into the file-level
#                     %NONBREAKING_PREFIX
sub init_sentence_splitter {
    my ($language, $prefixes) = @_;

    $LANGUAGE           = $language;
    %NONBREAKING_PREFIX = %{ $prefixes };
}
15
+
16
# Splits one chunk of input text into sentences.
#
# Parameter: the text to split (a trailing newline is removed).
# Returns:   the list of sentences, one element per sentence. A chunk that
#            is blank or consists of a single markup tag ("<...>") yields
#            an empty list.
#
# The markup/blank decision is taken on the argument itself. It used to be
# taken on the global $_, which made the result depend on whatever the
# caller happened to have in $_ at the time.
sub split_sentences {
    my ($input_text) = @_;
    $input_text = "" unless defined $input_text;
    chomp($input_text);

    my $text = "";
    unless ($input_text =~ /^<.+>$/ || $input_text =~ /^\s*$/) {
        # regular text: hand it over (with a trailing space) for splitting
        $text = &do_it_for($input_text . " ", $input_text);
    }

    # the splitter marks every sentence end with a newline
    return split("\n", $text);
}
36
+
37
# Runs preprocess() on a paragraph unless the paragraph is false
# (undef, "" or "0"), in which case it is handed back untouched.
#
# Parameters: the paragraph text, then the markup line it came with (the
#             markup is accepted but not used).
# Returns:    the processed paragraph.
sub do_it_for {
    my ($paragraph, $markup) = @_;

    return $paragraph unless $paragraph;

    my $processed = &preprocess($paragraph);
    return $processed;
}
43
+
44
# Inserts sentence breaks ("\n") into one paragraph of text.
#
# Parameter: the paragraph as a single string.
# Returns:   the paragraph with a newline after every detected sentence
#            end; the result always ends with "\n".
#
# Reads the file-level %NONBREAKING_PREFIX table: a word mapped to 1 never
# ends a sentence, a word mapped to 2 does not end one when the next word
# starts with a digit.
sub preprocess {
    my ($text) = @_;

    # Collapses repeated spaces and removes spaces around line breaks and at
    # both ends of the string. Used before and after the splitting pass.
    my $tidy_spaces = sub {
        my ($string) = @_;
        $string =~ s/ +/ /g;
        $string =~ s/\n /\n/g;
        $string =~ s/ \n/\n/g;
        $string =~ s/^ //g;
        $string =~ s/ $//g;
        return $string;
    };

    $text = $tidy_spaces->($text);

    ##### add sentence breaks as needed #####

    # "?" or "!" followed by optional opening punctuation and any word
    # character (not only an upper-case letter)
    $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\w])/$1\n$2/g;

    # multi-dots followed by sentence starters
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # end punctuation inside a quote or parenthetical, followed by a possible
    # sentence-starter punctuation and an upper-case letter
    $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # end punctuation followed by sentence-starter punctuation and an
    # upper-case letter
    $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # special punctuation cases are covered; check all remaining periods
    my @words = split(/ /, $text);
    $text = "";

    # stop one word before the end so that $words[$i+1] always exists
    for my $i (0 .. $#words - 1) {
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            my $prefix         = $1;
            my $starting_punct = $2;

            # 1 or 2 when the word is a listed prefix with no punctuation
            # between it and the period, otherwise 0
            my $prefix_kind = ($prefix && !$starting_punct)
                ? ($NONBREAKING_PREFIX{$prefix} || 0)
                : 0;

            if (   $prefix_kind != 1                                    # listed prefix: never break
                && $words[$i] !~ /(\.)[\p{IsUpper}\-]+(\.+)$/           # upper-case acronym: never break
                && $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
                && !($prefix_kind == 2 && $words[$i+1] =~ /^[0-9]+/))   # numeric non-breaker before a number
            {
                # next word starts with optional quotes followed by an
                # upper-case letter or a digit: this period ends a sentence
                $words[$i] = $words[$i] . "\n";
            }
        }
        $text = $text . $words[$i] . " ";
    }

    # The loop stopped one word early for the look-ahead; append the last
    # word now. Text that collapsed to nothing has no last word, so there is
    # nothing (and in particular no undef) to append.
    $text = $text . $words[-1] if @words;

    $text = $tidy_spaces->($text);

    # add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
113
+
114
+ 1;
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # reads input text and fixes some mistakes
4
+ # developed by Andoni Azpeitia
5
+
6
+ use utf8;
7
+
8
+ my %NONBREAKING_PREFIX = ();
9
+ my $LANGUAGE;
10
+
11
+ my $START_QUOTES_REGEX = "“|‘|«|‹";
12
+ my $END_QUOTES_REGEX = "”|’|»|›";
13
+
14
# Configures the text fixer.
#   first argument  - language code, stored in the file-level $LANGUAGE
#   second argument - reference to a hash of nonbreaking prefixes; its
#                     contents are copied into the file-level
#                     %NONBREAKING_PREFIX
sub init_text_fixer {
    my ($language, $prefixes) = @_;

    $LANGUAGE           = $language;
    %NONBREAKING_PREFIX = %{ $prefixes };
}
18
+
19
# Repairs common punctuation/spacing mistakes in one line of input.
#
# Parameter: the raw line (a trailing newline is removed).
# Returns:   the repaired line; every whitespace-separated word of the
#            input is followed by a single space in the result.
#
# The line is first passed through fix_encoding(), then inspected word by
# word. Reads the file-level $START_QUOTES_REGEX, $END_QUOTES_REGEX and
# %NONBREAKING_PREFIX.
sub fix_text {
    my ($text) = @_;

    chomp($text);

    # fix encoding
    $text = &fix_encoding($text);

    # "Zero or more opening quotes". The alternation has to be grouped
    # before the "*" is applied: interpolating $START_QUOTES_REGEX* directly
    # expands to  a|b|c|d*  which splits the surrounding pattern into
    # unrelated alternatives.
    my $opening_quotes = qr/(?:$START_QUOTES_REGEX)*/;

    my @words = split(/\s/, $text);
    $text = "";

    for my $i (0 .. $#words) {
        my $word = $words[$i];

        # said: “bla bla bla.”Bla bla => said: “bla bla bla”. Bla bla
        if ($word =~ /^(\S+)\.($END_QUOTES_REGEX)($opening_quotes\p{IsUpper}\S*)$/) {
            my ($pre, $quote, $post) = ($1, $2, $3);
            $word = $pre . $quote . ". " . $post;
        }
        # to a "breach of trust." A German => to a "breach of trust". A German
        elsif ($word =~ /^(\S+)\.($END_QUOTES_REGEX)$/) {
            my ($pre, $quote) = ($1, $2);
            my $is_last_word = ($i == $#words);

            # move the period out of the quote at the end of the line or
            # when the next word starts a sentence
            if ($is_last_word || $words[$i+1] =~ /^$opening_quotes\p{IsUpper}\S*$/) {
                $word = $pre . $quote . ".";
            }
        }
        # OpeNER is amazing.OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ($word =~ /^(\S+)\.(\S+)$/) {
            my ($pre, $post) = ($1, $2);
            my $prefix_kind = $NONBREAKING_PREFIX{$pre} || 0;

            # NOTE(review): purely numeric words such as "3.14" match none
            # of the exceptions below and are split into "3. 14" - confirm
            # that this is intended.
            my $keep_joined =
                   ($pre =~ /\./ && $pre =~ /\p{IsAlpha}/)          # dotted name, e.g. www.example.com
                || $prefix_kind == 1                                # listed nonbreaking prefix
                || $post =~ /^[\p{IsLower}]/                        # lower-case continuation
                || ($prefix_kind == 2 && $post =~ /^[0-9]+/);       # numeric prefix before a number

            $word = $pre . ". " . $post unless $keep_joined;
        }
        # OpeNER is amazing .OpeNER is cool. => OpeNER is amazing . OpeNER is cool.
        elsif ($word =~ /^\.(\p{IsUpper}\S+)$/) {
            my $post = $1;
            if ($i > 0 && $words[$i-1] =~ /^(\S+)$/) {
                $word = ". " . $post;
            }
        }

        $text .= $word . " ";
    }

    return $text;
}
80
+
81
+ sub fix_encoding {
82
+
83
+ my $text = shift(@_);
84
+
85
+ $text =~ s/’/'/g;
86
+ $text =~ s/À/À/g;
87
+ $text =~ s/“/“/g;
88
+ $text =~ s///g;
89
+ $text =~ s//\"/g;
90
+ ############################################
91
+ ############################################
92
+ $text =~ s/…/…/g; # elipsis
93
+ $text =~ s/…/…/g; # elipsis
94
+ $text =~ s/–/–/g; # long hyphen
95
+ $text =~ s/’/’/g; #curly apostrophe
96
+ $text =~ s/“/“/g; # curly open quote
97
+ $text =~ s/â€/”/g; # curly close quote
98
+ $text =~ s/»/»/g;
99
+ $text =~ s/«/«/g;
100
+ ############################################
101
+ $text =~ s/á/á/g;
102
+ $text =~ s/é/é/g;
103
+ $text =~ s/Ã\*/í/g;
104
+ $text =~ s/ó/ó/g;
105
+ $text =~ s/ú/ú/g;
106
+
107
+ $text =~ s/Á/Á/g;
108
+ $text =~ s/É/É/g;
109
+ $text =~ s/Í/Í/g;
110
+ $text =~ s/Ó/Ó/g;
111
+ $text =~ s/Ú/Ú/g;
112
+ ############################################
113
+ $text =~ s/ñ/ñ/g;
114
+ $text =~ s/ç/ç/g;
115
+ $text =~ s/Å“/œ/g;
116
+
117
+ $text =~ s/Ñ/Ñ/g;
118
+ $text =~ s/Ç/Ç/g;
119
+ $text =~ s/Å’/Œ/g;
120
+ ############################################
121
+ $text =~ s/©/©/g;
122
+ $text =~ s/®/®/g;
123
+ $text =~ s/â„¢/™/g;
124
+ $text =~ s/Ø/Ø/g;
125
+ $text =~ s/ª/ª/g;
126
+ ############################################
127
+ $text =~ s/ä/ä/g;
128
+ $text =~ s/ë/ë/g;
129
+ $text =~ s/ï/ï/g;
130
+ $text =~ s/ö/ö/g;
131
+ $text =~ s/ü/ü/g;
132
+
133
+ $text =~ s/Ä/Ä/g;
134
+ $text =~ s/Ë/Ë/g;
135
+ $text =~ s/Ï /Ï/g;
136
+ $text =~ s/Ö /Ö/g;
137
+ $text =~ s/Ãœ/Ü/g;
138
+ ############################################
139
+ $text =~ s/à/à/g;
140
+ $text =~ s/è/è/g;
141
+ $text =~ s/ì/ì/g;
142
+ $text =~ s/ò/ò/g;
143
+ $text =~ s/ù/ù/g;
144
+
145
+ $text =~ s/À/À/g;
146
+ $text =~ s/È/È/g;
147
+ $text =~ s/ÃŒ/Ì/g;
148
+ $text =~ s/Ã’/Ò/g;
149
+ $text =~ s/Ù/Ù/g;
150
+ ############################################
151
+ $text =~ s/â/â/g;
152
+ $text =~ s/ê/ê/g;
153
+ $text =~ s/î/î/g;
154
+ $text =~ s/ô/ô/g;
155
+ $text =~ s/û/û/g;
156
+
157
+ $text =~ s/Â/Â/g;
158
+ $text =~ s/Ê/Ê/g;
159
+ $text =~ s/ÃŽ/Î/g;
160
+ $text =~ s/Ô/Ô/g;
161
+ $text =~ s/Û/Û/g;
162
+ ############################################
163
+ $text =~ s/É/E/g;
164
+
165
+
166
+ return $text;
167
+ }
168
+
169
+ 1;
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Sample Tokenizer
4
+ # written by Josh Schroeder, based on code by Philipp Koehn
5
+ # changed by Haritz Arzelus (#2012/11/19) Aitor García and Andoni Azpeitia
6
+
7
+ use FindBin;
8
+
9
+ use lib "$FindBin::Bin/lib";
10
+
11
+ use Encode::Guess;
12
+ use Time::Stamp;
13
+
14
+ require "$FindBin::Bin"."/text-fixer.pl";
15
+ require "$FindBin::Bin"."/split-sentences.pl";
16
+ require "$FindBin::Bin"."/tokenizer.pl";
17
+ require "$FindBin::Bin"."/load-prefixes.pl";
18
+
19
+ no warnings;
20
+ use encoding 'utf8';
21
+
22
+ binmode(STDIN, ":utf8");
23
+ binmode(STDOUT, ":utf8");
24
+ binmode(STDERR, ":utf8");
25
+
26
+ #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
27
+ #use FindBin qw($Bin);
28
+ #use strict;
29
+ #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
30
+ #use Time::HiRes;
31
+
32
+ #>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
33
+ #my $mydir = "$Bin/nonbreaking_prefixes";
34
+ #changed by me (aitor) to point to the directory of the script, instead of current working directory
35
+ #my $mydir = "$FindBin::Bin"."/nonbreaking_prefixes";
36
+ #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
37
+
38
+ #my $start = [ Time::HiRes::gettimeofday( ) ];
39
+
40
+ #while (@ARGV) {
41
+ # $_ = shift;
42
+ # /^-l$/ && ($language = shift, next);
43
+ # /^-q$/ && ($QUIET = 1, next);
44
+ # /^-h$/ && ($HELP = 1, next);
45
+ #}
46
+
47
+ #if (!$QUIET) {
48
+ # print STDERR "Tokenizer v3\n";
49
+ # print STDERR "Language: $language\n";
50
+ #}
51
+
52
+ #argument variables
53
# Script-wide state. $FILE, $LANGUAGE, $NOTIMESTAMP and $HELP are filled in
# by checkArguments(); the two version strings are written into the KAF
# header.
my %NONBREAKING_PREFIX = ();
my $SENT_VERSION       = "0.0.1";
my $TOK_VERSION        = "1.0.1";
my $FILE               = "";
my $LANGUAGE;
my $NOTIMESTAMP        = 0;
my $HELP               = 0;

# marker appended to quote tokens that were substituted at tokenization
my $SUBSTITUTE = "####";

# Invalid command line: show the usage text and fail.
unless (checkArguments(\@ARGV) == 1) {
    displayHelp();
    exit -1;
}

# Valid command line asking for help: show the usage text and stop.
if ($HELP == 1) {
    displayHelp();
    exit 0;
}
73
+
74
+ # load nonbreaking prefixes and init both tokenizer and sentence splitter
75
# Load the nonbreaking prefixes for the language once...
%NONBREAKING_PREFIX = %{ &load_prefixes($LANGUAGE) };

# ...and hand the same table to the text fixer, the sentence splitter and
# the tokenizer, in that order.
for my $init_stage (\&init_text_fixer, \&init_sentence_splitter, \&init_tokenizer) {
    &$init_stage($LANGUAGE, \%NONBREAKING_PREFIX);
}

# Timestamp for the KAF header: the current time, or a fixed all-zero value
# when -t was given.
my $timestamp = ($NOTIMESTAMP == 0) ? timestamp() : "0000-00-00T00:00:00Z";
87
+
88
+ # print kaf header
89
# Print the KAF header, with a filled-in <fileDesc> only when a file name
# was given on the command line, then open the <text> layer.
if ($FILE ne "") {

    # Split "name.ext" at the last dot. A name without any dot is used
    # as-is with an empty file type: rindex() returns -1 in that case, and
    # feeding -1 to substr() would silently drop the last character of the
    # name.
    my $dot = rindex($FILE, ".");
    my ($filename, $filetype) = ($dot >= 0)
        ? (substr($FILE, 0, $dot), uc(substr($FILE, $dot + 1)))
        : ($FILE, "");

    print_kafheader($filename, $filetype, $timestamp, $LANGUAGE);
}
else {
    print_kafheader_nofile($timestamp);
}
print " <text>\n";
100
+
101
+ # process text
102
# Main loop: reads STDIN line by line, fixes, splits and tokenizes every
# text line and prints one <wf> element per token, then closes the document.
#
# Counters shared by all <wf> elements:
my $sent      = 1;    # sentence number, keeps counting across lines
my $para      = 1;    # paragraph number, one per tokenized line
my $counter   = 0;    # running word-form id
my $charcount = 0;    # total length of the lines tokenized so far

# The current line is kept in $_ on purpose: helper routines called below
# may consult $_ themselves (split_sentences is one candidate).
while (<STDIN>) {

    # Don't try to tokenize XML/HTML tag lines or blank lines.
    # NOTE(review): such lines add nothing to $charcount or $para, so the
    # offsets do not account for them - confirm that this is intended.
    if (/^<.+>$/ || /^\s*$/) {
        chomp($_);
        next;
    }

    # fix input text
    my $text = &fix_text($_);

    # split sentences
    my @sentences  = &split_sentences($text);
    my $index      = 0;    # position of the current token inside $_
    my $last_index = 0;    # position just behind the previous token

    for my $j (0 .. $#sentences) {
        my $tok = tokenize($sentences[$j]);

        # tokenize some special characters
        # NOTE(review): "|" inside a character class is a literal pipe, so
        # pipes are separated as well - confirm that this is intended.
        $tok =~ s/([“|”|…|—|–|«|»])([^ ])/$1 $2/g;
        $tok =~ s/([^ ])([“|”|…|—|–|«|»])/$1 $2/g;
        # detokenize tokens with @
        $tok =~ s/ @ /@/g;
        # detokenize some tokens with '
        $tok =~ s/([DLNO]) '/$1'/g;
        $tok =~ s/o( )?'( )?clock/o'clock/g;
        $tok =~ s/ ' ([0-9][0-9]s)/ '$1/g;
        # detokenize some time formats
        $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9]*) "/$1'$2"/g;
        $tok =~ s/([0-9][0-9]*) : ([0-9][0-9])/$1:$2/g;
        # detokenize some height formats
        $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9])/$1'$2/g;
        # tokenize two dashes
        $tok =~ s/\-\-/ \-\-/g;
        # correct ºC tokenization
        $tok =~ s/([0-9])( )?º( )?C/$1 ºC/g;
        $tok =~ s/ +/ /g;
        chomp($tok);

        my @tokens = split(/ /, $tok);

        for my $i (0 .. $#tokens) {
            my $token = $tokens[$i];

            $index = index($_, $token, $last_index);

            # if the token was substituted at tokenization, look for the
            # character it replaced and restore the plain quote
            if ($token eq "\"" . $SUBSTITUTE) {
                $index = index($_, "'", $last_index);
                $token = "\"";
            }
            elsif ($token eq "\'" . $SUBSTITUTE) {
                $index = index($_, "`", $last_index);
                $token = "\'";
            }

            # Decide whether the position found is trustworthy. If not, the
            # search is repeated starting two characters earlier.
            my $search_again = 0;
            my $look_from    = length($token) + $last_index - 1;

            if ($index == -1) {
                # the text was preprocessed and tokens were moved
                # ('hello.' => 'hello'.): the "." is to the left of the "'"
                $search_again = 1;
            }
            elsif ($i < $#tokens) {
                # The next token is in the same sentence: a match lying
                # behind it belongs to a later occurrence. The comparison
                # uses positions inside the line, so the "not found" value
                # -1 is recognised on every line, not only on the first.
                my $next_index = index($_, $tokens[$i+1], $look_from);
                $search_again = 1 if $next_index > -1 && $index > $next_index + 1;
            }
            elsif ($j < $#sentences) {
                # The next token is the first token of the next sentence of
                # this line. The test uses the per-line index $j; the
                # document-wide $sent only agreed with it on the first line.
                my $next_sentence = $sentences[$j+1];
                $next_sentence = &tokenize($next_sentence);
                my @next_tokens = split(/ /, $next_sentence);
                my $next_index  = index($_, $next_tokens[0], $look_from);
                $search_again = 1 if $next_index > -1 && $index > $next_index + 1;
            }

            $index = index($_, $token, $last_index - 2) if $search_again;

            my $offset       = $charcount + $index;
            my $token_length = length($token);
            &print_line(++$counter, $sent, $para, $offset, $token_length, $token);

            $last_index = $index + $token_length;
        }

        $sent++;
    }

    # an empty line still counts as one character
    $charcount += length($_) || 1;
    $para++;
}
print " </text>\n";
print "</KAF>\n";
226
+
227
+
228
# Prints one word form as a KAF <wf> element on STDOUT.
#
# Parameters, in order: word id (printed with a "w" prefix), sentence
# number, paragraph number, character offset, token length, token text.
#
# The token is wrapped in a CDATA section. A literal "]]>" inside the token
# would close that section early and break the XML, so every occurrence is
# split across two adjacent CDATA sections.
sub print_line {
    my ($wid, $sent, $para, $offset, $length, $token) = @_;

    (my $cdata = $token) =~ s/\]\]>/]]]]><![CDATA[>/g;

    print " <wf wid=\"w".$wid."\" sent=\"".$sent."\" para=\"".$para."\" offset=\"".$offset."\" length=\"". $length."\"><![CDATA[".$cdata."]]></wf>\n";
}
240
+
241
# Prints the KAF XML header, including a <fileDesc> for the input file.
#
# Parameters, in order: file name (without extension), file type, timestamp,
# language code. The version attributes come from the file-level
# $SENT_VERSION and $TOK_VERSION.
#
# The file name and type originate from the command line, so the characters
# that are special inside an XML attribute are escaped before printing.
sub print_kafheader {
    my ($filename, $filetype, $timestamp, $LANGUAGE) = @_;

    for my $attribute ($filename, $filetype) {
        next unless defined $attribute;
        $attribute =~ s/&/&amp;/g;
        $attribute =~ s/</&lt;/g;
        $attribute =~ s/>/&gt;/g;
        $attribute =~ s/"/&quot;/g;
    }

    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc filename=\"".$filename."\" filetype=\"".$filetype."\" />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}
257
# Prints the KAF XML header with an empty <fileDesc />.
#
# Parameter: the timestamp for the linguistic-processor entries. Language
# and version values are read from the file-level $LANGUAGE, $SENT_VERSION
# and $TOK_VERSION.
sub print_kafheader_nofile {
    my ($timestamp) = @_;

    my @header_lines = (
        "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n",
        "<KAF xml:lang=\"" . $LANGUAGE . "\" version=\"v1.opener\">\n",
        " <kafHeader>\n",
        " <fileDesc />\n",
        " <linguisticProcessors layer=\"text\">\n",
        " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"" . $SENT_VERSION . "\" timestamp=\"" . $timestamp . "\"/>\n",
        " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"" . $TOK_VERSION . "\" timestamp=\"" . $timestamp . "\"/>\n",
        " </linguisticProcessors>\n",
        " </kafHeader>\n",
    );

    # one print per line, as before
    print $_ for @header_lines;
}
270
+
271
# Parses the command line and fills in the file-level option variables
# ($LANGUAGE, $FILE, $NOTIMESTAMP, $HELP).
#
# Parameter: reference to the argument list.
# Returns:   1 when the arguments are usable, 0 otherwise (a message has
#            then been printed on STDERR).
#
# Recognised options (case-insensitive):
#   -l / --language <code>   input language, validated with checkLanguage()
#   -f / --filename <name>   optional file name for the KAF header
#   -t                       use the static timestamp
#   --help                   ask for the usage text
#
# The long forms are the ones announced by displayHelp(). A request for
# help is valid on its own; it no longer fails for lack of a language.
sub checkArguments {
    my $argref  = shift(@_);
    my @arg     = @{ $argref };
    my $correct = 1;

    if (scalar(@arg) == 0) {
        print STDERR "Error: language don't specified\n";
        return 0;
    }

    # true when the candidate is itself an option and so cannot be a value
    my $is_option = sub {
        my $candidate = lc(shift);
        return scalar(grep { $candidate eq $_ } qw(-l --language -f --filename -t));
    };

    for my $i (0 .. $#arg) {
        my $option    = lc($arg[$i]);
        my $has_value = scalar(@arg) > $i + 1;

        if ($option eq "-l" || $option eq "--language") {
            if (!$has_value) {
                $correct = 0;
                print STDERR "Error: language don't specified\n";
            }
            elsif (!$is_option->($arg[$i+1]) && checkLanguage($arg[$i+1]) == 1) {
                $LANGUAGE = $arg[$i+1];
            }
            else {
                $correct = 0;
                print STDERR "Error: language \"".$arg[$i+1]."\" not supported\n";
            }
        }
        elsif ($option eq "-f" || $option eq "--filename") {
            if ($has_value && !$is_option->($arg[$i+1])) {
                $FILE = $arg[$i+1];
            }
            else {
                $correct = 0;
                print STDERR "Error: file's name empty\n";
            }
        }
        elsif ($option eq "-t") {
            $NOTIMESTAMP = 1;
        }
        elsif ($option eq "--help") {
            $HELP = 1;
        }
    }

    # asking for help does not require a language
    return $correct if $HELP == 1;

    if (defined($LANGUAGE) && $LANGUAGE ne "") {
        return $correct;
    }

    print STDERR "Error: language don't specified\n";
    return 0;
}
321
+
322
# Tells whether a language code is supported by the tokenizer.
#
# Parameter: the language code (compared case-sensitively).
# Returns:   1 for en, es, fr, it, de and nl; -1 for anything else.
sub checkLanguage {
    my ($language) = @_;

    my %is_supported = map { $_ => 1 } qw(en es fr it de nl);

    return $is_supported{$language} ? 1 : -1;
}
332
+
333
# Prints the usage text on STDERR.
sub displayHelp {
    my @usage = (
        "\nThis aplication reads a text from standard input in order to tokenize.\n",
        "Application arguments:\n",
        "-l, --language input text's language.\n",
        "-f, --filename (optional) file's name.\n",
        "-t, (optional) o use static timestamp at KAF header.\n",
        "--help, outputs aplication help.\n",
    );

    # one print per line, as before
    print STDERR $_ for @usage;
}
341
+
342
# Returns the value of Time::Stamp::gmstamp(), called in scalar context.
sub timestamp {
    return scalar(Time::Stamp::gmstamp());
}
346
+
347
# Guesses the character encoding of a file from its first 500 bytes.
#
# Parameter: path of the file to inspect.
# Returns:   the name of the encoding found by guess_encoding(), or "utf8"
#            when the file cannot be opened, is empty, or the guess does not
#            produce an encoding object.
#
# The file is opened read-only with the three-argument form of open(), so a
# path starting with ">" or ending in "|" is treated as a plain file name
# instead of being interpreted as an open mode.
sub detect_encoding {
    my $file = shift(@_);
    my $enc;

    # an unreadable file is not an error here: fall through to the default
    if (open(my $fh, '<', $file)) {
        binmode($fh);
        if (read($fh, my $filestart, 500)) {
            $enc = guess_encoding($filestart);
        }
        close($fh);
    }

    # guess_encoding() returns an object on success and a plain message
    # string otherwise
    return ref($enc) ? $enc->name : "utf8";
}
363
+