opener-tokenizer-base 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +148 -0
- data/bin/tokenizer-base +5 -0
- data/bin/tokenizer-de +5 -0
- data/bin/tokenizer-en +5 -0
- data/bin/tokenizer-es +5 -0
- data/bin/tokenizer-fr +5 -0
- data/bin/tokenizer-it +5 -0
- data/bin/tokenizer-nl +5 -0
- data/core/lib/Data/OptList.pm +256 -0
- data/core/lib/Params/Util.pm +866 -0
- data/core/lib/Sub/Exporter.pm +1101 -0
- data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
- data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
- data/core/lib/Sub/Exporter/Util.pm +354 -0
- data/core/lib/Sub/Install.pm +329 -0
- data/core/lib/Time/Stamp.pm +808 -0
- data/core/load-prefixes.pl +43 -0
- data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
- data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
- data/core/split-sentences.pl +114 -0
- data/core/text-fixer.pl +169 -0
- data/core/tokenizer-cli.pl +363 -0
- data/core/tokenizer.pl +145 -0
- data/lib/opener/tokenizers/base.rb +84 -0
- data/lib/opener/tokenizers/base/version.rb +8 -0
- data/opener-tokenizer-base.gemspec +25 -0
- metadata +134 -0
@@ -0,0 +1,114 @@
|
|
1
|
+
#!/usr/bin/perl -w

# Based on Preprocessor written by Philipp Koehn
# Changed by aazpeitia (aazpeitia@vicomtech.org)

use strict;

# Nonbreaking-prefix table and language code shared by the subs below;
# both are populated by init_sentence_splitter().
my %NONBREAKING_PREFIX = ();
my $LANGUAGE;
# Record the language code and nonbreaking-prefix table (hash ref) that
# split_sentences()/preprocess() will consult.
sub init_sentence_splitter {
    my ($language, $prefix_ref) = @_;
    $LANGUAGE           = $language;
    %NONBREAKING_PREFIX = %{$prefix_ref};
}
|
15
|
+
|
16
|
+
# Split one chunk of input text into sentences.
# Returns the list of sentences produced by splitting on the "\n" markers
# that preprocess() inserts.
sub split_sentences {

    my $input_text = shift(@_);
    chomp($input_text);
    my $text = "";
    # NOTE(review): this regex matches $_ (the caller's topic variable),
    # not $input_text.  It only behaves as presumably intended when called
    # from inside a while(<STDIN>) loop, as tokenizer-cli.pl does — confirm
    # before reusing this sub elsewhere.
    if (/^<.+>$/ || /^\s*$/) {
        #time to process this block, we've hit a blank or <p>
        #&do_it_for($text,$input_text);
        #print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
        #$text = "";
        $text .= &do_it_for($text,$input_text);
        $text .= "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
    }
    else {
        #append the text, with a space
        $text .= $input_text. " ";
    }
    # The accumulated text is reprocessed unconditionally here, overwriting
    # whatever the branches above produced.
    $text = &do_it_for($text,$input_text);
    return split("\n", $text);
}
|
36
|
+
|
37
|
+
# Run preprocess() over the accumulated text, if there is any, and return
# the result.  $markup is accepted for historical reasons but unused.
sub do_it_for {
    my ($text, $markup) = @_;
    return $text ? &preprocess($text) : $text;
}
|
43
|
+
|
44
|
+
# Insert sentence breaks ("\n") into one paragraph of text.
#
# Expects init_sentence_splitter() to have been called first so that
# %NONBREAKING_PREFIX is populated (value 1 = never break after this
# prefix; value 2 = do not break when followed by a number).
# Returns the text with a "\n" after every detected sentence end and a
# guaranteed trailing "\n".
#
# Fix vs. original: removed the unused `my $word;` declaration.
sub preprocess {
    #this is one paragraph
    my ($text) = @_;

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #####add sentence breaks as needed#####

    # Non-period end-of-sentence markers (?!) followed by sentence starters.
    # (The stricter upper-case-only variant is kept for reference.)
    #$text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;
    $text =~ s/([?!]) +([\'\"\(\[\¿\¡\p{IsPi}]*[\w])/$1\n$2/g;

    # Multi-dots followed by sentence starters.
    $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g;

    # Sentences ending with punctuation inside a quote or parenthetical,
    # followed by possible sentence-starter punctuation and upper case.
    $text =~ s/([?!\.][\ ]*[\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # Sentences ending with punctuation followed by sentence-starter
    # punctuation and upper case.
    $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;

    # Special punctuation cases are covered.  Check all remaining periods
    # word by word, with a one-token look-ahead.
    my $i;
    my @words = split(/ /,$text);
    $text = "";
    for ($i=0;$i<(scalar(@words)-1);$i++) {
        if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
            # $1 = candidate prefix word, $2 = trailing quote/bracket punctuation.
            my $prefix = $1;
            my $starting_punct = $2;
            if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
                # Known honorific with no trailing punctuation: never break.
            } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
                # Upper-case acronym (e.g. "U.S."): not breaking.
            } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
                # Next word begins with optional initial quotes, maybe a
                # space, then upper case or a digit: break here — unless we
                # have a numeric non-breaker (class 2) followed by a number.
                $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
            }
        }
        $text = $text.$words[$i]." ";
    }

    # We stopped one token from the end to allow easy look-ahead.  Append it now.
    $text = $text.$words[$i];

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    #add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
|
113
|
+
|
114
|
+
1;
|
data/core/text-fixer.pl
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
#!/usr/bin/perl -w

# reads input text and fixes some mistakes
# developed by Andoni Azpeitia

use utf8;

# Language and nonbreaking-prefix table, populated by init_text_fixer().
my %NONBREAKING_PREFIX = ();
my $LANGUAGE;

# Regex alternations of typographic opening/closing quote characters,
# interpolated into the patterns in fix_text().
my $START_QUOTES_REGEX = "“|‘|«|‹";
my $END_QUOTES_REGEX = "”|’|»|›";
|
14
|
+
# Record the language code and nonbreaking-prefix table (hash ref) that
# fix_text() will consult.
sub init_text_fixer {
    my ($language, $prefix_ref) = @_;
    $LANGUAGE           = $language;
    %NONBREAKING_PREFIX = %{$prefix_ref};
}
|
18
|
+
|
19
|
+
# Repair common punctuation/spacing mistakes in raw input text before
# sentence splitting: moves a period that ended up inside a closing quote,
# and adds the missing space after a sentence-final period that is glued to
# the next word.  Runs fix_encoding() first.
# NOTE(review): the result carries one trailing space (each word is
# appended with "$word "); downstream code appears to tolerate this.
sub fix_text {

    my($text) = shift(@_);

    chomp($text);

    #fix encoding
    $text = &fix_encoding($text);


    #word token method: walk the text word by word (split on whitespace)
    my @words = split(/\s/,$text);
    $text = "";
    for (my $i=0;$i<(scalar(@words));$i++) {
        my $word = $words[$i];
        #Kumi Naidoo said: “bla bla bla.”Bla bla => Kumi Naidoo said: “bla bla bla”. Bla bla

        if ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)($START_QUOTES_REGEX*\p{IsUpper}\S*)$/ ) {
            my $pre = $1;
            my $quote = $2;
            my $post = $3;

            # Move the period outside the closing quote and insert a space.
            $word = $pre.$quote.". ".$post;
        }
        #to a "breach of trust." A German => to a "breach of trust". A German
        elsif ( $word =~ /^(\S+)\.($END_QUOTES_REGEX)$/ ) {
            my $pre = $1;
            my $quote = $2;
            # Only move the period when the next word looks like a sentence
            # start, or when this is the last word of the text.
            if ( ($i<scalar(@words)-1 && $words[$i+1] =~ /^$START_QUOTES_REGEX*\p{IsUpper}\S*$/ )) {
                $word = $pre.$quote.".";
            }
            elsif ($i==scalar(@words)-1) {
                $word = $pre.$quote.".";
            }
        }
        #OpeNER is amazing.OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ( $word =~ /^(\S+)\.(\S+)$/) {
            my $pre = $1;
            my $post = $2;
            # Leave untouched: abbreviations with internal dots, known
            # nonbreaking prefixes (class 1), or a lower-case continuation.
            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($post =~ /^[\p{IsLower}]/) ) {
                #no change
            } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($post =~ /^[0-9]+/) ) {
                # Numeric nonbreaking prefix followed by a number: no change.
                #no change
            } else {
                $word = $pre.". ".$post;
            }
        }
        #OpeNER is amazing .OpeNER is cool. => OpeNER is amazing. OpeNER is cool.
        elsif ( $word =~ /^\.(\p{IsUpper}\S+)$/ ) {
            my $post = $1;
            # NOTE(review): /^(\S+)$/ is true for any nonempty previous
            # word, so this condition only really checks $i>0 — confirm
            # whether a stricter test was intended.
            if ( $i>0 && $words[$i-1] =~ /^(\S+)$/) {
                $word = ". ".$post;
            }
        }
        $text .= $word." ";
    }
    #freedoms." 'Outrageous'Although => freedoms". 'Outrageous' Although
    #$text =~ s/(\")([^\"]+)(\. ?)(\")/$1$2$4$3/g;
    #$text =~ s/(\')([^\']+)(\. ?)(\')/$1$2$4$3/g;
    return $text;
}
|
80
|
+
|
81
|
+
# Best-effort repair of UTF-8-as-Latin-1 mojibake ("Ã¡" => "á" style
# double-encoding artifacts) via a fixed table of substitutions.
#
# NOTE(review): this source was recovered from a rendering that itself
# re-decoded the mojibake, so many patterns below show the *same* character
# on both sides (apparent no-ops) and several show an empty pattern.  In
# Perl an empty pattern re-uses the last successfully matched pattern,
# which is almost certainly not what was intended — the original pattern
# bytes were presumably lost.  Verify every substitution against the
# pristine upstream file before relying on this sub.
sub fix_encoding {

    my $text = shift(@_);

    $text =~ s/â/'/g;
    $text =~ s/Ã/À/g;
    # NOTE(review): empty patterns below re-use the previous match — the
    # intended mojibake sequences appear to have been lost in transit.
    $text =~ s//“/g;
    $text =~ s///g;
    $text =~ s//\"/g;
    ############################################
    ############################################
    $text =~ s/…/…/g; # elipsis
    $text =~ s/â¦/…/g; # elipsis
    $text =~ s/–/–/g; # long hyphen
    $text =~ s/’/’/g; #curly apostrophe
    $text =~ s/“/“/g; # curly open quote
    $text =~ s/â€/”/g; # curly close quote
    $text =~ s/»/»/g;
    $text =~ s/«/«/g;
    ############################################
    # Accented lower-case vowels (Spanish/French).
    $text =~ s/á/á/g;
    $text =~ s/é/é/g;
    $text =~ s/Ã\*/í/g;
    $text =~ s/ó/ó/g;
    $text =~ s/ú/ú/g;

    # Accented upper-case vowels.
    $text =~ s/Ã/Á/g;
    $text =~ s/É/É/g;
    $text =~ s/Ã/Í/g;
    $text =~ s/Ó/Ó/g;
    $text =~ s/Ú/Ú/g;
    ############################################
    $text =~ s/ñ/ñ/g;
    $text =~ s/ç/ç/g;
    $text =~ s/Å“/œ/g;

    $text =~ s/Ñ/Ñ/g;
    $text =~ s/Ç/Ç/g;
    $text =~ s/Å’/Œ/g;
    ############################################
    # Symbols.
    $text =~ s/©/©/g;
    $text =~ s/®/®/g;
    $text =~ s/â„¢/™/g;
    $text =~ s/Ø/Ø/g;
    $text =~ s/ª/ª/g;
    ############################################
    # Diaeresis.
    $text =~ s/ä/ä/g;
    $text =~ s/ë/ë/g;
    $text =~ s/ï/ï/g;
    $text =~ s/ö/ö/g;
    $text =~ s/ü/ü/g;

    $text =~ s/Ä/Ä/g;
    $text =~ s/Ë/Ë/g;
    $text =~ s/Ã /Ï/g;
    $text =~ s/Ö /Ö/g;
    $text =~ s/Ãœ/Ü/g;
    ############################################
    # Grave accents.
    $text =~ s/Ã /à/g;
    $text =~ s/è/è/g;
    $text =~ s/ì/ì/g;
    $text =~ s/ò/ò/g;
    $text =~ s/ù/ù/g;

    $text =~ s/À/À/g;
    $text =~ s/È/È/g;
    $text =~ s/ÃŒ/Ì/g;
    $text =~ s/Ã’/Ò/g;
    $text =~ s/Ù/Ù/g;
    ############################################
    # Circumflex accents.
    $text =~ s/â/â/g;
    $text =~ s/ê/ê/g;
    $text =~ s/î/î/g;
    $text =~ s/ô/ô/g;
    $text =~ s/û/û/g;

    $text =~ s/Â/Â/g;
    $text =~ s/Ê/Ê/g;
    $text =~ s/ÃŽ/Î/g;
    $text =~ s/Ô/Ô/g;
    $text =~ s/Û/Û/g;
    ############################################
    # Catch-all for any remaining stray "Ã".
    $text =~ s/Ã/E/g;


    return $text;
}
|
168
|
+
|
169
|
+
1;
|
@@ -0,0 +1,363 @@
|
|
1
|
+
#!/usr/bin/perl -w
|
2
|
+
|
3
|
+
# Sample Tokenizer
|
4
|
+
# written by Josh Schroeder, based on code by Philipp Koehn
|
5
|
+
# changed by Haritz Arzelus (#2012/11/19) Aitor García and Andoni Azpeitia
|
6
|
+
|
7
|
+
use FindBin;
|
8
|
+
|
9
|
+
use lib "$FindBin::Bin/lib";
|
10
|
+
|
11
|
+
use Encode::Guess;
|
12
|
+
use Time::Stamp;
|
13
|
+
|
14
|
+
require "$FindBin::Bin"."/text-fixer.pl";
|
15
|
+
require "$FindBin::Bin"."/split-sentences.pl";
|
16
|
+
require "$FindBin::Bin"."/tokenizer.pl";
|
17
|
+
require "$FindBin::Bin"."/load-prefixes.pl";
|
18
|
+
|
19
|
+
no warnings;
|
20
|
+
use encoding 'utf8';
|
21
|
+
|
22
|
+
binmode(STDIN, ":utf8");
|
23
|
+
binmode(STDOUT, ":utf8");
|
24
|
+
binmode(STDERR, ":utf8");
|
25
|
+
|
26
|
+
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
27
|
+
#use FindBin qw($Bin);
|
28
|
+
#use strict;
|
29
|
+
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
30
|
+
#use Time::HiRes;
|
31
|
+
|
32
|
+
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
|
33
|
+
#my $mydir = "$Bin/nonbreaking_prefixes";
|
34
|
+
#changed by me (aitor) to point to the directory of the script, instead of current working directory
|
35
|
+
#my $mydir = "$FindBin::Bin"."/nonbreaking_prefixes";
|
36
|
+
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
|
37
|
+
|
38
|
+
#my $start = [ Time::HiRes::gettimeofday( ) ];
|
39
|
+
|
40
|
+
#while (@ARGV) {
|
41
|
+
# $_ = shift;
|
42
|
+
# /^-l$/ && ($language = shift, next);
|
43
|
+
# /^-q$/ && ($QUIET = 1, next);
|
44
|
+
# /^-h$/ && ($HELP = 1, next);
|
45
|
+
#}
|
46
|
+
|
47
|
+
#if (!$QUIET) {
|
48
|
+
# print STDERR "Tokenizer v3\n";
|
49
|
+
# print STDERR "Language: $language\n";
|
50
|
+
#}
|
51
|
+
|
52
|
+
#argument variables
my %NONBREAKING_PREFIX = ();   # nonbreaking-prefix table for $LANGUAGE
my $SENT_VERSION = "0.0.1";    # sentence-splitter version reported in the KAF header
my $TOK_VERSION = "1.0.1";     # tokenizer version reported in the KAF header
my $FILE = "";                 # optional input file name (-f), used only in the header
my $LANGUAGE;                  # input language code (-l), required
my $NOTIMESTAMP = 0;           # -t: emit a static zero timestamp instead of now
my $HELP = 0;                  # --help given

# Placeholder the tokenizer substitutes for quote characters; the offset
# bookkeeping in the main loop undoes the substitution.
my $SUBSTITUTE = "####";

# Parse command-line arguments; show help and exit on --help or on error.
if (checkArguments(\@ARGV) == 1) {
    if ($HELP == 1) {
        displayHelp();
        exit 0;
    }
}
else {
    displayHelp();
    exit -1;
}
|
73
|
+
|
74
|
+
# load nonbreaking prefixes and init both tokenizer and sentence splitter
%NONBREAKING_PREFIX = %{ &load_prefixes($LANGUAGE) };

&init_text_fixer($LANGUAGE, \%NONBREAKING_PREFIX);
&init_sentence_splitter($LANGUAGE, \%NONBREAKING_PREFIX);
&init_tokenizer($LANGUAGE, \%NONBREAKING_PREFIX);


# get timestamp (static placeholder when -t was given)
my $timestamp = "0000-00-00T00:00:00Z";
if ($NOTIMESTAMP == 0) {
    $timestamp = timestamp();
}

# print kaf header; a fileDesc element is included only when -f was given
if ($FILE ne "") {

    # Split $FILE into base name and upper-cased extension.
    my $i = rindex($FILE, ".");
    my $filename = substr($FILE, 0, $i);
    my $filetype = uc(substr($FILE, $i+1, length($FILE)-length($filename)-1));
    print_kafheader($filename, $filetype, $timestamp, $LANGUAGE);
}
else {
    print_kafheader_nofile($timestamp);
}
print " <text>\n";
|
100
|
+
|
101
|
+
# process text: read STDIN line by line, fix/split/tokenize each line and
# emit one KAF <wf> element per token with its absolute character offset.
my $sent = 1;        # running sentence id
my $para = 1;        # running paragraph id
my $counter = 0;     # running word id
my $charcount = 0;   # characters consumed so far (basis for offsets)
while(<STDIN>) {

    if (/^<.+>$/ || /^\s*$/) {
        #don't try to tokenize XML/HTML tag lines
        chomp($_);
        #print $_;
    }
    else {

        #fix input text
        my $text = &fix_text($_);

        #split sentences
        my @sentences = &split_sentences($text);
        my $index = 0;        # position of the current token within $_
        my $last_index = 0;   # end position of the previous token within $_
        my $last_offset = -1; # absolute offset of the previous token
        my $j = 0;            # sentence counter within this input line
        foreach my $sentence (@sentences) {
            #print &tokenize($_);
            $tok = tokenize($sentence);
            #tokenize some especial characters
            # NOTE(review): the character classes below were garbled in
            # transit; several alternatives render as empty or as bare "|".
            # Verify them against the pristine upstream file.
            $tok =~ s/([|||||«|»])([^ ])/$1 $2/g;
            $tok =~ s/([^ ])([|||||«|»])/$1 $2/g;
            #detokenize tokens with @
            $tok =~ s/ @ /@/g;
            #detokenize some tokens with '
            $tok =~ s/([DLNO]) '/$1'/g;
            #$tok =~ s/([DLNO])' /$1'/g; changed by Andoni Azpeitia Vicomtech: L' armée => L'armée
            $tok =~ s/o( )?'( )?clock/o'clock/g;
            $tok =~ s/ ' ([0-9][0-9]s)/ '$1/g;
            #detokenize some time formats
            $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9]*) "/$1'$2"/g;
            $tok =~ s/([0-9][0-9]*) : ([0-9][0-9])/$1:$2/g;
            #detokenize some height formats
            $tok =~ s/([0-9][0-9]*) ' ([0-9][0-9])/$1'$2/g;
            #tokenize two dashes
            $tok =~ s/\-\-/ \-\-/g;
            #correct ºC tokenization
            $tok =~ s/([0-9])( )?º( )?C/$1 ºC/g;
            $tok =~ s/ +/ /g;
            #format the output as a kind of dummy KAF format
            chomp($tok);

            @tokens = split(/ /, $tok);

            my $i = 0;
            foreach my $token (@tokens) {

                # Locate the token in the raw input line to compute its offset.
                $index = index($_, $token, $last_index);

                #if token was substituted at tokenization, be careful
                if ( $token eq "\"".$SUBSTITUTE ) {
                    $index = index($_, "'", $last_index);
                    $token = "\"";
                }
                elsif ( $token eq "\'".$SUBSTITUTE ) {
                    $index = index($_, "`", $last_index);
                    $token = "\'";
                }

                my $offset = $charcount + $index;
                #if input text has been preprocesed and tokens has been moved ( 'hello.' => 'hello'.),
                # offset of the "." char is at the left of "'" char not at the right
                if ( $index==-1 ) {
                    $index = index($_, $token, $last_index-2);
                    $offset = $charcount + $index;
                }
                #make sure that found offset is not an offset of the same char at other position so,
                # find offset of the next token and compare
                # next token is at the same sentence
                elsif ( $i<scalar(@tokens)-1 ) {
                    my $next_token_index = $charcount + index($_, $tokens[$i+1], length($token) + $last_index-1);
                    if ( $index==-1 || ($next_token_index > -1 && $offset > $next_token_index+1) ) {
                        $index = index($_, $token, $last_index-2);
                        $offset = $charcount + $index;
                    }
                }
                #make sure that found offset is not an offset of the same char at other position so,
                # find offset of the next token and compare
                # next token is at next sentence
                elsif ( $sent < scalar(@sentences)) {
                    my $next_sentence = $sentences[$j+1];
                    $next_sentence = &tokenize($next_sentence);
                    my @next_tokens = split(/ /, $next_sentence);
                    my $next_token = $next_tokens[0];
                    my $next_token_index = $charcount + index($_, $next_token, length($token) + $last_index-1);
                    if ( ($next_token_index > -1 && $offset > $next_token_index+1) ) {
                        $index = index($_, $token, $last_index-2);
                        $offset = $charcount + $index;
                    }
                }

                my $token_length = length($token);
                &print_line(++$counter, $sent, $para, $offset, $token_length, $token);

                $last_index = $index + $token_length;
                $last_offset = $offset;
                $i++;
            }

            $j++;
            $sent++;
        }#foreach sentence

        # Advance the absolute character count (empty lines still count as 1).
        if (length($_) == 0) {
            $charcount += 1;
        }
        else {
            $charcount += length($_);
        }
        $para++;
    }
}#while(<STDIN>)
print " </text>\n";
print "</KAF>\n";
|
226
|
+
|
227
|
+
|
228
|
+
# Emit one token as a KAF <wf> word-form element (CDATA-wrapped) on STDOUT.
# Arguments: word id, sentence id, paragraph id, character offset, token
# length, and the token text itself.
sub print_line {
    my ($wid, $sent, $para, $offset, $length, $token) = @_;

    print " <wf wid=\"w".$wid."\" sent=\"".$sent."\" para=\"".$para."\" offset=\"".$offset."\" length=\"". $length."\"><![CDATA[".$token."]]></wf>\n";
}
|
240
|
+
|
241
|
+
# Print the KAF XML document header, including a fileDesc element built
# from the given file name/type, on STDOUT.
sub print_kafheader {
    my ($filename, $filetype, $timestamp, $LANGUAGE) = @_;

    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc filename=\"".$filename."\" filetype=\"".$filetype."\" />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}
|
257
|
+
# Print the KAF XML document header with an empty fileDesc element
# (used when no input file name was supplied).
sub print_kafheader_nofile {
    my ($timestamp) = @_;

    print "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
    print "<KAF xml:lang=\"".$LANGUAGE."\" version=\"v1.opener\">\n";
    print " <kafHeader>\n";
    print " <fileDesc />\n";
    print " <linguisticProcessors layer=\"text\">\n";
    print " <lp name=\"opener-sentence-splitter-$LANGUAGE\" version=\"".$SENT_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " <lp name=\"opener-tokenizer-$LANGUAGE\" version=\"".$TOK_VERSION."\" timestamp=\"".$timestamp."\"/>\n";
    print " </linguisticProcessors>\n";
    print " </kafHeader>\n";
}
|
270
|
+
|
271
|
+
# Parse the argument list (array ref) and populate the global option
# variables $LANGUAGE, $FILE, $NOTIMESTAMP and $HELP.
#
# Returns 1 when the arguments are valid and a supported language was
# given, 0 otherwise (an explanatory error is printed to STDERR).
#
# Fixes vs. original: grammatical error messages ("language don't
# specified"), and a guard against comparing an undef $LANGUAGE with `ne`.
sub checkArguments {
    my $argref = shift(@_);
    my @arg = @{ $argref };
    my $correct = 1;

    if (scalar(@arg) == 0) {
        print STDERR "Error: language not specified\n";
        return 0;
    }

    for (my $i = 0; $i < scalar(@arg); $i++) {
        my $flag = lc($arg[$i]);
        if ($flag eq "-l") {
            # -l must be followed by a non-flag value naming a supported language.
            if (scalar(@arg) > $i+1) {
                if (lc($arg[$i+1]) ne "-t" && lc($arg[$i+1]) ne "-f" && lc($arg[$i+1]) ne "-l" && checkLanguage($arg[$i+1]) == 1) {
                    $LANGUAGE = $arg[$i+1];
                }
                else {
                    $correct = 0;
                    print STDERR "Error: language \"".$arg[$i+1]."\" not supported\n";
                }
            }
            else {
                $correct = 0;
                print STDERR "Error: language not specified\n";
            }
        }
        elsif ($flag eq "-f") {
            # -f must be followed by a non-flag value naming the input file.
            if (scalar(@arg) > $i+1 && lc($arg[$i+1]) ne "-t" && lc($arg[$i+1]) ne "-f" && lc($arg[$i+1]) ne "-l") {
                $FILE = $arg[$i+1];
            }
            else {
                $correct = 0;
                print STDERR "Error: file's name empty\n";
            }
        }
        elsif ($flag eq "-t") {
            $NOTIMESTAMP = 1;
        }
        elsif ($flag eq "--help") {
            $HELP = 1;
        }
    }

    # A language is mandatory; guard against undef before the string compare.
    if (defined $LANGUAGE && $LANGUAGE ne "") {
        return $correct;
    }
    print STDERR "Error: language not specified\n";
    return 0;
}
|
321
|
+
|
322
|
+
# Return 1 when $language is one of the supported ISO 639-1 codes,
# -1 otherwise (callers test for == 1).
sub checkLanguage {
    my $language = shift(@_);
    my %supported = map { $_ => 1 } qw(en es fr it de nl);
    return $supported{$language} ? 1 : -1;
}
|
332
|
+
|
333
|
+
# Print usage information to STDERR.
# Fixes vs. original: typos in the user-facing text ("aplication" ->
# "application", "o use" -> "to use").
sub displayHelp {
    print STDERR "\nThis application reads a text from standard input in order to tokenize.\n";
    print STDERR "Application arguments:\n";
    print STDERR "-l, --language input text's language.\n";
    print STDERR "-f, --filename (optional) file's name.\n";
    print STDERR "-t, (optional) to use static timestamp at KAF header.\n";
    print STDERR "--help, outputs application help.\n";
}
|
341
|
+
|
342
|
+
# Current time as a GMT timestamp string, delegated to Time::Stamp.
sub timestamp {
    return Time::Stamp::gmstamp();
}
|
346
|
+
|
347
|
+
# Guess the encoding of $file from its first 500 bytes using
# Encode::Guess.  Returns the guessed encoding name, or "utf8" when the
# file cannot be read or the guess is ambiguous (best-effort fallback,
# matching the original behavior).
#
# Fixes vs. original: three-arg open with a lexical filehandle instead of
# an unchecked two-arg open on a bareword handle (2-arg open allows mode
# injection via the filename).
sub detect_encoding {
    my $file = shift(@_);
    my $enc;
    # Keep the best-effort contract: an unreadable file yields "utf8".
    open(my $fh, '<', $file) or return "utf8";
    binmode($fh);
    if (read($fh, my $filestart, 500)) {
        $enc = guess_encoding($filestart);
    }
    close($fh);
    # guess_encoding() returns an Encode object on success, a plain error
    # string otherwise.
    if (ref($enc)) {
        return $enc->name;
    }
    else {
        return "utf8";
    }
}
|
363
|
+
|