python-ucto 0.6.8__cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_ucto-0.6.8.data/data/sources/ucto_wrapper.pyx +184 -0
- python_ucto-0.6.8.dist-info/METADATA +20 -0
- python_ucto-0.6.8.dist-info/RECORD +18 -0
- python_ucto-0.6.8.dist-info/WHEEL +6 -0
- python_ucto-0.6.8.dist-info/top_level.txt +1 -0
- python_ucto.libs/libbz2-a273e504.so.1.0.6 +0 -0
- python_ucto.libs/libexttextcat-2-e165f333.0.so.0.0.0 +0 -0
- python_ucto.libs/libfolia-8d74cccd.so.21.0.0 +0 -0
- python_ucto.libs/libgomp-a34b3233.so.1.0.0 +0 -0
- python_ucto.libs/libicudata-cb3ba60c.so.50.2 +0 -0
- python_ucto.libs/libicui18n-97d01360.so.50.2 +0 -0
- python_ucto.libs/libicuio-d92a2ee9.so.50.2 +0 -0
- python_ucto.libs/libicuuc-1796a535.so.50.2 +0 -0
- python_ucto.libs/liblzma-004595ca.so.5.2.2 +0 -0
- python_ucto.libs/libticcutils-bcffb764.so.9.0.0 +0 -0
- python_ucto.libs/libucto-92c8806e.so.6.0.0 +0 -0
- python_ucto.libs/libxml2-174e59c1.so.2.9.14 +0 -0
- ucto.cpython-310-x86_64-linux-gnu.so +0 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
#embedsignature=True
|
|
2
|
+
#*****************************
|
|
3
|
+
# Python-ucto
|
|
4
|
+
# by Maarten van Gompel
|
|
5
|
+
# Centre for Language Studies
|
|
6
|
+
# Radboud University Nijmegen
|
|
7
|
+
#
|
|
8
|
+
# Licensed under GPLv3
|
|
9
|
+
#****************************/
|
|
10
|
+
|
|
11
|
+
from libcpp.string cimport string
|
|
12
|
+
from libcpp cimport bool
|
|
13
|
+
from libcpp.vector cimport vector
|
|
14
|
+
from cython.operator cimport dereference as deref, preincrement as inc
|
|
15
|
+
from cython import address
|
|
16
|
+
from libc.stdint cimport *
|
|
17
|
+
from libcpp.utility cimport pair
|
|
18
|
+
import os.path
|
|
19
|
+
import sys
|
|
20
|
+
cimport libfolia_classes
|
|
21
|
+
cimport ucto_classes
|
|
22
|
+
|
|
23
|
+
# Default uctodata release: used as the default `version` argument of
# installdata(), where it is interpolated into the download URL.
UCTODATAVERSION = "0.11"
|
|
24
|
+
|
|
25
|
+
class TokenRole:
    """Role flags carried by a token.

    Apart from NOROLE (zero), every value is a distinct single bit, so several
    roles can be OR-ed together into one integer and tested with ``&``.
    """

    NOROLE = 0
    NOSPACE = 1 << 0
    BEGINOFSENTENCE = 1 << 1
    ENDOFSENTENCE = 1 << 2
    NEWPARAGRAPH = 1 << 3
    BEGINQUOTE = 1 << 4
    ENDQUOTE = 1 << 5
    TEMPENDOFSENTENCE = 1 << 6
|
|
34
|
+
|
|
35
|
+
class Token:
    """One token: its text, its type label and a bitmask of TokenRole flags.

    The ``is...``/``nospace`` predicates return the masked integer rather than
    a bool: zero when the flag is absent, the flag's value when it is set.
    """

    def __init__(self, text, tokentype, role):
        self.text, self.tokentype, self.role = text, tokentype, role

    def _masked(self, flag):
        # Shared implementation of every role predicate below.
        return self.role & flag

    def __str__(self):
        return self.text

    def type(self):
        """Return the token type given at construction."""
        return self.tokentype

    def isendofsentence(self):
        """Non-zero if the ENDOFSENTENCE flag is set in the role."""
        return self._masked(TokenRole.ENDOFSENTENCE)

    def iseos(self):
        """Short alias: same result as isendofsentence()."""
        return self._masked(TokenRole.ENDOFSENTENCE)

    def isbeginofsentence(self):
        """Non-zero if the BEGINOFSENTENCE flag is set in the role."""
        return self._masked(TokenRole.BEGINOFSENTENCE)

    def isnewparagraph(self):
        """Non-zero if the NEWPARAGRAPH flag is set in the role."""
        return self._masked(TokenRole.NEWPARAGRAPH)

    def isbeginofquote(self):
        """Non-zero if the BEGINQUOTE flag is set in the role."""
        return self._masked(TokenRole.BEGINQUOTE)

    def isendofquote(self):
        """Non-zero if the ENDQUOTE flag is set in the role."""
        return self._masked(TokenRole.ENDQUOTE)

    def nospace(self):
        """Non-zero if the NOSPACE flag is set in the role."""
        return self._masked(TokenRole.NOSPACE)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
cdef class Tokenizer:
    """Wrapper around the C++ ``ucto_classes.TokenizerClass``.

    Construct it with a file name (passed, UTF-8 encoded, to the C++
    ``init()``), feed text with ``process()``, then iterate over the instance
    to obtain ``Token`` objects, or call ``sentences()`` for sentence strings.

    Keyword arguments accepted by the constructor. A boolean option is only
    switched on when the value is exactly ``True``; anything else, including
    other truthy values, switches it off:

    * ``lowercase``, ``uppercase``
    * ``paragraphdetection``, ``quotedetection``
    * ``sentenceperlineinput``, ``sentenceperlineoutput``
    * ``xmlinput`` (alias ``foliainput``)
    * ``xmloutput`` (alias ``foliaoutput``); the document id is read from
      ``docid`` and defaults to ``"untitled"``
    * ``debug`` -- converted with ``int()``
    * ``sentencedetection`` -- deprecated, only writes a warning to stderr

    Any other keyword raises ``ValueError``.
    """
    cdef ucto_classes.TokenizerClass tok

    def __init__(self, filename, **kwargs):
        self.tok.init(filename.encode('utf-8'))
        for arg, value in kwargs.items():
            if arg == 'lowercase':
                self.tok.setLowercase(value is True)
            elif arg == 'uppercase':
                self.tok.setUppercase(value is True)
            elif arg == 'sentencedetection':
                sys.stderr.write("[python-ucto] Argument 'sentencedetection' is deprecated and has no effect, it is always enabled.\n")
            elif arg == 'paragraphdetection':
                self.tok.setParagraphDetection(value is True)
            elif arg == 'quotedetection':
                self.tok.setQuoteDetection(value is True)
            elif arg == 'sentenceperlineinput':
                self.tok.setSentencePerLineInput(value is True)
            elif arg == 'sentenceperlineoutput':
                self.tok.setSentencePerLineOutput(value is True)
            elif arg == 'xmlinput' or arg == 'foliainput':
                self.tok.setXMLInput(value is True)
            elif arg == 'xmloutput' or arg == 'foliaoutput':
                # The document id travels in a separate keyword.
                docid = kwargs.get('docid', "untitled")
                self.tok.setXMLOutput(value is True, docid.encode('utf-8'))
            elif arg == 'debug':
                self.tok.setDebug(int(value))
            elif arg == 'docid':
                # Consumed by the xmloutput/foliaoutput branch above.
                pass
            else:
                raise ValueError("No such keyword argument: " + arg)

    def tokenize(self, str inputfile, str outputfile):
        """Run ucto from inputfile to outputfile (like command line tool)"""
        self.tok.tokenize(inputfile.encode('utf-8'), outputfile.encode('utf-8'))

    def process(self, str line):
        """Feed text to the tokeniser. This needs not be a single line."""
        self.tok.tokenizeLine(line.encode('utf-8'))

    def sentences(self):
        """Yield each sentence from ``getUTF8Sentences()`` as a ``str``.

        Every sentence is decoded from UTF-8 and has all occurrences of the
        literal ``<utt>`` removed.
        """
        cdef vector[string] results = self.tok.getUTF8Sentences()
        cdef vector[string].iterator it = results.begin()
        while it != results.end():
            yield str(deref(it), 'utf-8').replace("<utt>", '')
            inc(it)

    def lowercase(self):
        """Return the value of the C++ tokeniser's ``getLowercase()``."""
        return self.tok.getLowercase()

    def uppercase(self):
        """Return the value of the C++ tokeniser's ``getUppercase()``."""
        # Must query the uppercase getter: asking getLowercase() here made the
        # uppercase branch in __iter__ unreachable.
        # NOTE(review): assumes ucto_classes declares getUppercase() next to
        # setUppercase() -- confirm against ucto_classes.pxd.
        return self.tok.getUppercase()

    def __iter__(self):
        # Drains the tokeniser one sentence at a time via popSentence() and
        # yields a Token per entry; an empty sentence vector ends iteration.
        cdef vector[ucto_classes.Token] v
        cdef vector[ucto_classes.Token].iterator it
        while True:
            v = self.tok.popSentence()
            if v.empty():
                break
            it = v.begin()
            while it != v.end():
                tokentext = str(deref(it).texttostring(), 'utf-8')
                tokentype = str(deref(it).typetostring(), 'utf-8')
                role = deref(it).role
                # Case folding is applied here on the Python side; lowercase
                # takes precedence when both options are enabled.
                if self.lowercase():
                    tokentext = tokentext.lower()
                elif self.uppercase():
                    tokentext = tokentext.upper()
                yield Token(tokentext, tokentype, role)
                inc(it)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def localpath():
    """Return the directory used for ucto data.

    Resolution order:

    1. ``$UCTODATAPATH`` if that variable is set, returned as-is;
    2. ``$XDG_CONFIG_HOME/ucto`` if ``XDG_CONFIG_HOME`` is set;
    3. ``$HOME/.config/ucto``, with an empty string standing in for a
       missing ``HOME``.
    """
    env = os.environ

    override = env.get("UCTODATAPATH")
    if override is not None:
        return override

    config_home = env.get("XDG_CONFIG_HOME")
    if config_home is None:
        config_home = os.path.join(env.get("HOME", ""), ".config")
    return os.path.join(config_home, "ucto")
|
|
151
|
+
|
|
152
|
+
def installdata(targetdir=None, version=UCTODATAVERSION):
    """Download the uctodata release and install its config files.

    The work is done by shelling out to ``mkdir``, ``wget``, ``tar``, ``mv``
    and ``rm``; the archive is unpacked under ``$TMPDIR`` (default ``/tmp``).

    Args:
        targetdir: Parent directory; the data goes into ``<targetdir>/ucto``.
            When None, the directory returned by ``localpath()`` is used
            directly.
        version: uctodata release to fetch from GitHub.

    Raises:
        Exception: if one of the shell command chains exits non-zero.
    """
    import shlex  # function-scope import: only this function needs it

    if targetdir is None:
        targetdir = localpath()
    else:
        targetdir = os.path.join(targetdir, "ucto")

    # Everything interpolated into a shell command is quoted so that paths
    # with spaces or shell metacharacters are passed through literally.
    # The target is made absolute because the commands `cd` elsewhere first;
    # a relative path would otherwise resolve against the wrong directory.
    q_target = shlex.quote(os.path.abspath(targetdir))
    q_version = shlex.quote(str(version))

    if os.path.exists(targetdir):
        print(f"Uctodata configuration directory already exists: {targetdir}, refusing to overwrite, please remove it first if you want to install all data anew.", file=sys.stderr)
    else:
        tmpdir = os.environ.get("TMPDIR", "/tmp")
        q_tmp = shlex.quote(tmpdir)
        command = " && ".join([
            f"cd {q_tmp}",
            f"mkdir -p {q_target}",
            f"wget -O uctodata.tar.gz https://github.com/LanguageMachines/uctodata/releases/download/v{q_version}/uctodata-{q_version}.tar.gz",
            "tar -xzf uctodata.tar.gz",
            f"cd uctodata-{q_version}",
            f"mv config/* {q_target}/",
            "cd ..",
            f"rm -Rf uctodata-{q_version}",
            "rm -Rf uctodata.tar.gz",
        ])
        if os.system(command) != 0:
            raise Exception("Installation failed")
        print(f"Installation of uctodata {version} complete", file=sys.stderr)

    # NOTE(review): the original indentation of this step was not recoverable
    # from the source at hand. It is placed at function level so that
    # rerunning installdata() after installing libexttextcat fetches
    # textcat.cfg, as the message below promises -- confirm against upstream.
    if os.path.isdir("/usr/share/libexttextcat"):
        if os.system(f"cd {q_target} && wget -O textcat.cfg https://raw.githubusercontent.com/LanguageMachines/ucto/master/config/textcat.cfg") != 0:
            raise Exception("Installation of textcat.cfg failed")
    else:
        print("Language detection will not be available unless you install libexttextcat and rerun installdata()", file=sys.stderr)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: python-ucto
|
|
3
|
+
Version: 0.6.8
|
|
4
|
+
Summary: This is a Python binding to the tokenizer Ucto. Tokenisation is one of the first steps in almost any Natural Language Processing task, yet it is not always as trivial a task as it appears to be. This binding makes the power of the ucto tokeniser available to Python. Ucto itself is a regular-expression based, extensible, and advanced tokeniser written in C++ (https://languagemachines.github.io/ucto).
|
|
5
|
+
Home-page: https://github.com/proycon/python-ucto
|
|
6
|
+
Author: Maarten van Gompel
|
|
7
|
+
Author-email: proycon@anaproy.nl
|
|
8
|
+
License: GPLv3
|
|
9
|
+
Keywords: tokenizer tokenization tokeniser tokenisation nlp computational_linguistics ucto
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
12
|
+
Classifier: Programming Language :: Cython
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Operating System :: POSIX
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: Science/Research
|
|
17
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
18
|
+
Requires: ucto (>=0.34)
|
|
19
|
+
Requires-Dist: Cython
|
|
20
|
+
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
ucto.cpython-310-x86_64-linux-gnu.so,sha256=IpUbCOhQZ2QqBJiVDwRemKGiNLep7dnEXSrTxTJnZvc,2317201
|
|
2
|
+
python_ucto-0.6.8.data/data/sources/ucto_wrapper.pyx,sha256=hsussZVPOXyikT9gtnBjR_aE-eWZhdQnvpopiM5XJRU,6392
|
|
3
|
+
python_ucto.libs/libicuuc-1796a535.so.50.2,sha256=ZPT2z0tpkV6chLxylDTz47Zaw3Gt15-izOGUkLni31M,1796193
|
|
4
|
+
python_ucto.libs/libgomp-a34b3233.so.1.0.0,sha256=On6uznIxkRvi-7Gz58tMtcLg-E4MK7c3OUcrWh_uyME,168193
|
|
5
|
+
python_ucto.libs/libicui18n-97d01360.so.50.2,sha256=yosCXXIz7LxTVHdhu51XJEa_DXTOHfo9REusho0I12c,2733217
|
|
6
|
+
python_ucto.libs/libxml2-174e59c1.so.2.9.14,sha256=YLx-sKMzrajESv4FC5XZlTDceuoSgvlLPCUpRjKeq0Q,6791169
|
|
7
|
+
python_ucto.libs/libfolia-8d74cccd.so.21.0.0,sha256=A-VRpMHjqPGVMsu1GXW4XLEhFsO0Q5P82H7i9eY45iM,27319017
|
|
8
|
+
python_ucto.libs/libexttextcat-2-e165f333.0.so.0.0.0,sha256=FDvo3bKVOKgMSv41BCGw5MYTr_tiWs-xjF4FEVYkq60,21617
|
|
9
|
+
python_ucto.libs/libucto-92c8806e.so.6.0.0,sha256=0wogFlgJgOZMJrCuJpKaf4WgFIqoaicHO39b4NHFlN8,6106761
|
|
10
|
+
python_ucto.libs/libicuio-d92a2ee9.so.50.2,sha256=mJCS6k0yQpjOqeKjoHihzEiDLGej2WpYhwCYsSPIhws,64017
|
|
11
|
+
python_ucto.libs/libticcutils-bcffb764.so.9.0.0,sha256=Ca3KZPlaNQhMn5MQ2crR7489t4vHZ4iUDsOvJAstMjY,8811377
|
|
12
|
+
python_ucto.libs/libicudata-cb3ba60c.so.50.2,sha256=mi096d4G6tWwhWUE-5Q8VeNA2WoUS8GRbeOhH-IBpSk,20787817
|
|
13
|
+
python_ucto.libs/liblzma-004595ca.so.5.2.2,sha256=8QVimqSNA2IHkdqYx4HnesWENyP2gf2uXocOhf7tetc,163473
|
|
14
|
+
python_ucto.libs/libbz2-a273e504.so.1.0.6,sha256=Ks7i35uwgx4aPZalUSQBwyfxeXGQ9BGdNInwVf00yb8,70993
|
|
15
|
+
python_ucto-0.6.8.dist-info/WHEEL,sha256=K6ps4aVUsEay-13wCJy8H3VKOTWNlXTmCBW4TF80uCE,151
|
|
16
|
+
python_ucto-0.6.8.dist-info/RECORD,,
|
|
17
|
+
python_ucto-0.6.8.dist-info/METADATA,sha256=L5sNmcy0FhzyB9J5Y15dZBHcQY-xuJH0UVCC0hYXoeU,1136
|
|
18
|
+
python_ucto-0.6.8.dist-info/top_level.txt,sha256=7zjSe25cKXiD3nFVCqHOi-tWjWL3N2nPFaVCsq3V6mw,5
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ucto
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|