SinaTools 0.1.38__py2.py3-none-any.whl → 0.1.40__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.40.data/data/sinatools/environment.yml +182 -0
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/METADATA +13 -8
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/RECORD +15 -15
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/WHEEL +1 -1
- sinatools/VERSION +1 -1
- sinatools/environment.yml +161 -206
- sinatools/ner/__init__.py +7 -5
- sinatools/ner/data/datasets.py +7 -3
- sinatools/ner/data_format.py +24 -12
- sinatools/ner/helpers.py +49 -18
- sinatools/ner/trainers/BaseTrainer.py +2 -2
- SinaTools-0.1.38.data/data/sinatools/environment.yml +0 -227
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.38.dist-info → SinaTools-0.1.40.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,182 @@
|
|
1
|
+
name: dev
|
2
|
+
channels:
|
3
|
+
- pytorch
|
4
|
+
- nvidia
|
5
|
+
- defaults
|
6
|
+
- https://repo.anaconda.com/pkgs/main
|
7
|
+
- https://repo.anaconda.com/pkgs/r
|
8
|
+
dependencies:
|
9
|
+
- _libgcc_mutex=0.1=main
|
10
|
+
- _openmp_mutex=5.1=1_gnu
|
11
|
+
- _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
|
12
|
+
- binutils_impl_linux-64=2.40=h5293946_0
|
13
|
+
- binutils_linux-64=2.40.0=hc2dff05_1
|
14
|
+
- blas=1.0=mkl
|
15
|
+
- brotli-python=1.0.9=py311h6a678d5_8
|
16
|
+
- bzip2=1.0.8=h5eee18b_6
|
17
|
+
- ca-certificates=2024.11.26=h06a4308_0
|
18
|
+
- certifi=2024.12.14=py311h06a4308_0
|
19
|
+
- charset-normalizer=3.3.2=pyhd3eb1b0_0
|
20
|
+
- cuda-cudart=12.4.127=0
|
21
|
+
- cuda-cupti=12.4.127=0
|
22
|
+
- cuda-libraries=12.4.1=0
|
23
|
+
- cuda-nvrtc=12.4.127=0
|
24
|
+
- cuda-nvtx=12.4.127=0
|
25
|
+
- cuda-opencl=12.4.127=0
|
26
|
+
- cuda-runtime=12.4.1=0
|
27
|
+
- cuda-version=11.7=h6a555f7_3
|
28
|
+
- cudatoolkit=11.7.0=hd8887f6_10
|
29
|
+
- ffmpeg=4.3=hf484d3e_0
|
30
|
+
- filelock=3.13.1=py311h06a4308_0
|
31
|
+
- freetype=2.12.1=h4a9f257_0
|
32
|
+
- fsspec=2024.6.1=py311h06a4308_0
|
33
|
+
- gcc_impl_linux-64=11.2.0=h1234567_1
|
34
|
+
- gcc_linux-64=11.2.0=h5c386dc_1
|
35
|
+
- giflib=5.2.2=h5eee18b_0
|
36
|
+
- gmp=6.2.1=h295c915_3
|
37
|
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
38
|
+
- gnutls=3.6.15=he1e5248_0
|
39
|
+
- gxx_impl_linux-64=11.2.0=h1234567_1
|
40
|
+
- gxx_linux-64=11.2.0=hc2dff05_1
|
41
|
+
- idna=3.7=py311h06a4308_0
|
42
|
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
43
|
+
- jinja2=3.1.4=py311h06a4308_1
|
44
|
+
- jpeg=9e=h5eee18b_3
|
45
|
+
- kernel-headers_linux-64=3.10.0=h57e8cba_10
|
46
|
+
- lame=3.100=h7b6447c_0
|
47
|
+
- lcms2=2.16=hb9589c4_0
|
48
|
+
- ld_impl_linux-64=2.40=h12ee557_0
|
49
|
+
- lerc=4.0.0=h6a678d5_0
|
50
|
+
- libabseil=20240116.2=cxx17_h6a678d5_0
|
51
|
+
- libcublas=12.4.5.8=0
|
52
|
+
- libcufft=11.2.1.3=0
|
53
|
+
- libcufile=1.9.1.3=0
|
54
|
+
- libcurand=10.3.5.147=0
|
55
|
+
- libcusolver=11.6.1.9=0
|
56
|
+
- libcusparse=12.3.1.170=0
|
57
|
+
- libdeflate=1.22=h5eee18b_0
|
58
|
+
- libffi=3.4.4=h6a678d5_1
|
59
|
+
- libgcc-devel_linux-64=11.2.0=h1234567_1
|
60
|
+
- libgcc-ng=11.2.0=h1234567_1
|
61
|
+
- libgomp=11.2.0=h1234567_1
|
62
|
+
- libiconv=1.16=h5eee18b_3
|
63
|
+
- libidn2=2.3.4=h5eee18b_0
|
64
|
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
65
|
+
- libnpp=12.2.5.30=0
|
66
|
+
- libnvfatbin=12.4.127=0
|
67
|
+
- libnvjitlink=12.4.127=0
|
68
|
+
- libnvjpeg=12.3.1.117=0
|
69
|
+
- libpng=1.6.39=h5eee18b_0
|
70
|
+
- libprotobuf=4.25.3=he621ea3_0
|
71
|
+
- libstdcxx-devel_linux-64=11.2.0=h1234567_1
|
72
|
+
- libstdcxx-ng=11.2.0=h1234567_1
|
73
|
+
- libtasn1=4.19.0=h5eee18b_0
|
74
|
+
- libtiff=4.5.1=hffd6297_1
|
75
|
+
- libunistring=0.9.10=h27cfd23_0
|
76
|
+
- libuuid=1.41.5=h5eee18b_0
|
77
|
+
- libwebp=1.3.2=h11a3e52_0
|
78
|
+
- libwebp-base=1.3.2=h5eee18b_1
|
79
|
+
- llvm-openmp=14.0.6=h9e868ea_0
|
80
|
+
- lz4-c=1.9.4=h6a678d5_1
|
81
|
+
- markupsafe=2.1.3=py311h5eee18b_0
|
82
|
+
- mkl=2023.1.0=h213fc3f_46344
|
83
|
+
- mkl-service=2.4.0=py311h5eee18b_1
|
84
|
+
- mkl_fft=1.3.11=py311h5eee18b_0
|
85
|
+
- mkl_random=1.2.8=py311ha02d727_0
|
86
|
+
- mpc=1.1.0=h10f8cd9_1
|
87
|
+
- mpfr=4.0.2=hb69a4c5_1
|
88
|
+
- mpmath=1.3.0=py311h06a4308_0
|
89
|
+
- ncurses=6.4=h6a678d5_0
|
90
|
+
- nettle=3.7.3=hbbd107a_1
|
91
|
+
- networkx=3.2.1=py311h06a4308_0
|
92
|
+
- numpy=2.0.1=py311h08b1b3b_1
|
93
|
+
- numpy-base=2.0.1=py311hf175353_1
|
94
|
+
- openh264=2.1.1=h4ff587b_0
|
95
|
+
- openjpeg=2.5.2=he7f1fd0_0
|
96
|
+
- openssl=3.0.15=h5eee18b_0
|
97
|
+
- pillow=11.0.0=py311hcea889d_1
|
98
|
+
- pip=24.2=py311h06a4308_0
|
99
|
+
- pysocks=1.7.1=py311h06a4308_0
|
100
|
+
- python=3.11.11=he870216_0
|
101
|
+
- pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
|
102
|
+
- pytorch-cuda=12.4=hc786d27_7
|
103
|
+
- pytorch-mutex=1.0=cuda
|
104
|
+
- pyyaml=6.0.2=py311h5eee18b_0
|
105
|
+
- readline=8.2=h5eee18b_0
|
106
|
+
- requests=2.32.3=py311h06a4308_1
|
107
|
+
- setuptools=75.1.0=py311h06a4308_0
|
108
|
+
- sqlite=3.45.3=h5eee18b_0
|
109
|
+
- sysroot_linux-64=2.17=h57e8cba_10
|
110
|
+
- tbb=2021.8.0=hdb19cb5_0
|
111
|
+
- tk=8.6.14=h39e8969_0
|
112
|
+
- torchaudio=2.5.1=py311_cu124
|
113
|
+
- torchtriton=3.1.0=py311
|
114
|
+
- torchvision=0.20.1=py311_cu124
|
115
|
+
- typing_extensions=4.12.2=py311h06a4308_0
|
116
|
+
- urllib3=2.2.3=py311h06a4308_0
|
117
|
+
- wheel=0.44.0=py311h06a4308_0
|
118
|
+
- xz=5.4.6=h5eee18b_1
|
119
|
+
- yaml=0.2.5=h7b6447c_0
|
120
|
+
- zlib=1.2.13=h5eee18b_1
|
121
|
+
- zstd=1.5.6=hc292b87_0
|
122
|
+
- pip:
|
123
|
+
- absl-py==2.1.0
|
124
|
+
- accelerate==1.2.1
|
125
|
+
- aiohappyeyeballs==2.4.4
|
126
|
+
- aiohttp==3.11.11
|
127
|
+
- aiosignal==1.3.2
|
128
|
+
- annotated-types==0.7.0
|
129
|
+
- attrs==24.3.0
|
130
|
+
- datasets==3.2.0
|
131
|
+
- deepspeed==0.16.2
|
132
|
+
- dill==0.3.8
|
133
|
+
- einops==0.8.0
|
134
|
+
- flash-attn==2.7.2.post1
|
135
|
+
- frozenlist==1.5.0
|
136
|
+
- grpcio==1.70.0
|
137
|
+
- hjson==3.1.0
|
138
|
+
- huggingface-hub==0.27.0
|
139
|
+
- joblib==1.4.2
|
140
|
+
- markdown==3.7
|
141
|
+
- markdown-it-py==3.0.0
|
142
|
+
- mdurl==0.1.2
|
143
|
+
- mpi4py==4.0.1
|
144
|
+
- msgpack==1.1.0
|
145
|
+
- multidict==6.1.0
|
146
|
+
- multiprocess==0.70.16
|
147
|
+
- natsort==8.4.0
|
148
|
+
- ninja==1.11.1.3
|
149
|
+
- nvidia-ml-py==12.560.30
|
150
|
+
- packaging==24.2
|
151
|
+
- pandas==2.2.3
|
152
|
+
- peft==0.14.0
|
153
|
+
- propcache==0.2.1
|
154
|
+
- protobuf==6.30.0
|
155
|
+
- psutil==6.1.1
|
156
|
+
- py-cpuinfo==9.0.0
|
157
|
+
- pyarrow==18.1.0
|
158
|
+
- pydantic==2.10.4
|
159
|
+
- pydantic-core==2.27.2
|
160
|
+
- pygments==2.18.0
|
161
|
+
- python-dateutil==2.9.0.post0
|
162
|
+
- pytz==2024.2
|
163
|
+
- regex==2024.11.6
|
164
|
+
- rich==13.9.4
|
165
|
+
- safetensors==0.4.5
|
166
|
+
- scikit-learn==1.6.1
|
167
|
+
- scipy==1.15.2
|
168
|
+
- seqeval==1.2.2
|
169
|
+
- six==1.17.0
|
170
|
+
- sympy==1.13.1
|
171
|
+
- tensorboard==2.19.0
|
172
|
+
- tensorboard-data-server==0.7.2
|
173
|
+
- threadpoolctl==3.5.0
|
174
|
+
- tokenizers==0.21.0
|
175
|
+
- tqdm==4.67.1
|
176
|
+
- transformers==4.47.1
|
177
|
+
- trl==0.12.0
|
178
|
+
- tzdata==2024.2
|
179
|
+
- werkzeug==3.1.3
|
180
|
+
- xxhash==3.5.0
|
181
|
+
- yarl==1.18.3
|
182
|
+
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.40
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
|
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
15
|
Requires-Dist: pathlib
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist:
|
18
|
-
Requires-Dist:
|
19
|
-
Requires-Dist:
|
20
|
-
|
21
|
-
|
16
|
+
Requires-Dist: transformers==4.47.1
|
17
|
+
Requires-Dist: torchvision==0.20.1
|
18
|
+
Requires-Dist: seqeval==1.2.2
|
19
|
+
Requires-Dist: natsort==7.1.1
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: home-page
|
23
|
+
Dynamic: keywords
|
24
|
+
Dynamic: license
|
25
|
+
Dynamic: requires-dist
|
26
|
+
Dynamic: summary
|
22
27
|
|
23
28
|
SinaTools
|
24
29
|
======================
|
@@ -1,7 +1,7 @@
|
|
1
|
-
SinaTools-0.1.
|
2
|
-
sinatools/VERSION,sha256=
|
1
|
+
SinaTools-0.1.40.data/data/sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
|
2
|
+
sinatools/VERSION,sha256=X8RiPEX_AmrlLAnj8YcBMQpdMfq9ZIPos6V7xEw6f8o,6
|
3
3
|
sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
|
4
|
-
sinatools/environment.yml,sha256=
|
4
|
+
sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
|
5
5
|
sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
|
6
6
|
sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
|
7
7
|
sinatools/CLI/DataDownload/download_files.py,sha256=EezvbukR3pZ8s6mGZnzTcjsbo3CBDlC0g6KhJWlYp1w,2686
|
@@ -76,21 +76,21 @@ sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB
|
|
76
76
|
sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
|
77
77
|
sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
|
78
78
|
sinatools/morphology/morph_analyzer.py,sha256=JOH2UWKNQWo5UzpWNzP9R1D3B3qLSogIiMp8n0N_56o,7177
|
79
|
-
sinatools/ner/__init__.py,sha256=
|
80
|
-
sinatools/ner/data_format.py,sha256=
|
79
|
+
sinatools/ner/__init__.py,sha256=bBXwAShP9vVvRD1WsmJtSOueboxRGJftYdKu24cMS8E,1181
|
80
|
+
sinatools/ner/data_format.py,sha256=VmFshZbEPOsWxsb4tgSkwvbM1k7yCce4kmtPkCiWgwM,4513
|
81
81
|
sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
|
82
82
|
sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
|
83
|
-
sinatools/ner/helpers.py,sha256=
|
83
|
+
sinatools/ner/helpers.py,sha256=sX6ezVbuVQxk_xJqZwhUzJVFVuVmFGmei_kd6r3sPHE,3652
|
84
84
|
sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
|
85
85
|
sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
|
86
86
|
sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
|
87
|
-
sinatools/ner/data/datasets.py,sha256=
|
87
|
+
sinatools/ner/data/datasets.py,sha256=_uUlvBAhnTtPwKLj0wIbmB04VCBidfwffxKorLGHq_g,5134
|
88
88
|
sinatools/ner/data/transforms.py,sha256=URMz1dHzkHjgUGAkDOenCWvQThO1ha8XeQVjoLL9RXM,4874
|
89
89
|
sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
|
90
90
|
sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
|
91
91
|
sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
|
92
92
|
sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
|
93
|
-
sinatools/ner/trainers/BaseTrainer.py,sha256=
|
93
|
+
sinatools/ner/trainers/BaseTrainer.py,sha256=Uar8HxtgBXCVhKa85sEN622d9P7JiFBcWfs46uRG4aA,4068
|
94
94
|
sinatools/ner/trainers/BertNestedTrainer.py,sha256=iJOah69tXZsAXBimqP0odEsk8SPX4A355riePzW2BFs,8632
|
95
95
|
sinatools/ner/trainers/BertTrainer.py,sha256=BtttsrHPolmK3eRDqrgVUuv6lVMuImIeskxhi02Q-44,6596
|
96
96
|
sinatools/ner/trainers/__init__.py,sha256=Xnbi_M4KKJRqV7FJe1vklyT0nEW2Q2obxgcWkbR0ZbA,190
|
@@ -114,10 +114,10 @@ sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
|
|
114
114
|
sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
|
115
115
|
sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
|
116
116
|
sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
|
117
|
-
SinaTools-0.1.
|
118
|
-
SinaTools-0.1.
|
119
|
-
SinaTools-0.1.
|
120
|
-
SinaTools-0.1.
|
121
|
-
SinaTools-0.1.
|
122
|
-
SinaTools-0.1.
|
123
|
-
SinaTools-0.1.
|
117
|
+
SinaTools-0.1.40.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
|
118
|
+
SinaTools-0.1.40.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
|
119
|
+
SinaTools-0.1.40.dist-info/METADATA,sha256=woJwOAHIJuAlctnXXt8oiYGoDDcV42OrjmWOE3Q6ycI,3410
|
120
|
+
SinaTools-0.1.40.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
|
121
|
+
SinaTools-0.1.40.dist-info/entry_points.txt,sha256=_CsRKM_tSCWV5hefBNUsWf9_6DrJnzFlxeAo1wm5XqY,1302
|
122
|
+
SinaTools-0.1.40.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
|
123
|
+
SinaTools-0.1.40.dist-info/RECORD,,
|
sinatools/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.40
|
sinatools/environment.yml
CHANGED
@@ -1,227 +1,182 @@
|
|
1
|
-
name:
|
1
|
+
name: dev
|
2
2
|
channels:
|
3
|
-
- anaconda
|
4
3
|
- pytorch
|
5
4
|
- nvidia
|
6
|
-
- conda-forge
|
7
5
|
- defaults
|
6
|
+
- https://repo.anaconda.com/pkgs/main
|
7
|
+
- https://repo.anaconda.com/pkgs/r
|
8
8
|
dependencies:
|
9
9
|
- _libgcc_mutex=0.1=main
|
10
10
|
- _openmp_mutex=5.1=1_gnu
|
11
|
-
-
|
12
|
-
-
|
13
|
-
-
|
14
|
-
- aiosignal=1.2.0=pyhd8ed1ab_0
|
15
|
-
- arrow-cpp=8.0.0=py310h3098874_0
|
16
|
-
- async-timeout=4.0.2=py310h06a4308_0
|
17
|
-
- attrs=22.1.0=pyh71513ae_1
|
18
|
-
- aws-c-common=0.4.57=he6710b0_1
|
19
|
-
- aws-c-event-stream=0.1.6=h2531618_5
|
20
|
-
- aws-checksums=0.1.9=he6710b0_0
|
21
|
-
- aws-sdk-cpp=1.8.185=hce553d0_0
|
11
|
+
- _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
|
12
|
+
- binutils_impl_linux-64=2.40=h5293946_0
|
13
|
+
- binutils_linux-64=2.40.0=hc2dff05_1
|
22
14
|
- blas=1.0=mkl
|
23
|
-
-
|
24
|
-
-
|
25
|
-
-
|
26
|
-
-
|
27
|
-
-
|
28
|
-
-
|
29
|
-
-
|
30
|
-
-
|
31
|
-
-
|
32
|
-
-
|
33
|
-
-
|
34
|
-
-
|
35
|
-
-
|
36
|
-
-
|
37
|
-
- cryptography=38.0.1=py310h9ce1e76_0
|
38
|
-
- cuda=11.7.1=0
|
39
|
-
- cuda-cccl=11.7.91=0
|
40
|
-
- cuda-command-line-tools=11.7.1=0
|
41
|
-
- cuda-compiler=11.7.1=0
|
42
|
-
- cuda-cudart=11.7.99=0
|
43
|
-
- cuda-cudart-dev=11.7.99=0
|
44
|
-
- cuda-cuobjdump=11.7.91=0
|
45
|
-
- cuda-cupti=11.7.101=0
|
46
|
-
- cuda-cuxxfilt=11.7.91=0
|
47
|
-
- cuda-demo-suite=11.8.86=0
|
48
|
-
- cuda-documentation=11.8.86=0
|
49
|
-
- cuda-driver-dev=11.7.99=0
|
50
|
-
- cuda-gdb=11.8.86=0
|
51
|
-
- cuda-libraries=11.7.1=0
|
52
|
-
- cuda-libraries-dev=11.7.1=0
|
53
|
-
- cuda-memcheck=11.8.86=0
|
54
|
-
- cuda-nsight=11.8.86=0
|
55
|
-
- cuda-nsight-compute=11.8.0=0
|
56
|
-
- cuda-nvcc=11.7.99=0
|
57
|
-
- cuda-nvdisasm=11.8.86=0
|
58
|
-
- cuda-nvml-dev=11.7.91=0
|
59
|
-
- cuda-nvprof=11.8.87=0
|
60
|
-
- cuda-nvprune=11.7.91=0
|
61
|
-
- cuda-nvrtc=11.7.99=0
|
62
|
-
- cuda-nvrtc-dev=11.7.99=0
|
63
|
-
- cuda-nvtx=11.7.91=0
|
64
|
-
- cuda-nvvp=11.8.87=0
|
65
|
-
- cuda-runtime=11.7.1=0
|
66
|
-
- cuda-sanitizer-api=11.8.86=0
|
67
|
-
- cuda-toolkit=11.7.1=0
|
68
|
-
- cuda-tools=11.7.1=0
|
69
|
-
- cuda-visual-tools=11.7.1=0
|
70
|
-
- dataclasses=0.8=pyhc8e2a94_3
|
71
|
-
- datasets=2.6.1=pyhd8ed1ab_0
|
72
|
-
- dill=0.3.5.1=pyhd8ed1ab_0
|
15
|
+
- brotli-python=1.0.9=py311h6a678d5_8
|
16
|
+
- bzip2=1.0.8=h5eee18b_6
|
17
|
+
- ca-certificates=2024.11.26=h06a4308_0
|
18
|
+
- certifi=2024.12.14=py311h06a4308_0
|
19
|
+
- charset-normalizer=3.3.2=pyhd3eb1b0_0
|
20
|
+
- cuda-cudart=12.4.127=0
|
21
|
+
- cuda-cupti=12.4.127=0
|
22
|
+
- cuda-libraries=12.4.1=0
|
23
|
+
- cuda-nvrtc=12.4.127=0
|
24
|
+
- cuda-nvtx=12.4.127=0
|
25
|
+
- cuda-opencl=12.4.127=0
|
26
|
+
- cuda-runtime=12.4.1=0
|
27
|
+
- cuda-version=11.7=h6a555f7_3
|
28
|
+
- cudatoolkit=11.7.0=hd8887f6_10
|
73
29
|
- ffmpeg=4.3=hf484d3e_0
|
74
|
-
-
|
75
|
-
- filelock=3.8.0=pyhd8ed1ab_0
|
30
|
+
- filelock=3.13.1=py311h06a4308_0
|
76
31
|
- freetype=2.12.1=h4a9f257_0
|
77
|
-
-
|
78
|
-
-
|
79
|
-
-
|
80
|
-
-
|
81
|
-
- giflib=5.2.1=h7b6447c_0
|
82
|
-
- glog=0.6.0=h6f12383_0
|
32
|
+
- fsspec=2024.6.1=py311h06a4308_0
|
33
|
+
- gcc_impl_linux-64=11.2.0=h1234567_1
|
34
|
+
- gcc_linux-64=11.2.0=h5c386dc_1
|
35
|
+
- giflib=5.2.2=h5eee18b_0
|
83
36
|
- gmp=6.2.1=h295c915_3
|
37
|
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
84
38
|
- gnutls=3.6.15=he1e5248_0
|
85
|
-
|
86
|
-
|
87
|
-
-
|
88
|
-
-
|
89
|
-
-
|
90
|
-
-
|
91
|
-
-
|
92
|
-
- importlib-metadata=5.0.0=pyha770c72_1
|
93
|
-
- importlib_metadata=5.0.0=hd8ed1ab_1
|
94
|
-
- intel-openmp=2021.4.0=h06a4308_3561
|
95
|
-
- joblib=1.2.0=pyhd8ed1ab_0
|
96
|
-
- jpeg=9e=h7f8727e_0
|
97
|
-
- keyutils=1.6.1=h166bdaf_0
|
98
|
-
- krb5=1.19.3=h3790be6_0
|
39
|
+
- gxx_impl_linux-64=11.2.0=h1234567_1
|
40
|
+
- gxx_linux-64=11.2.0=hc2dff05_1
|
41
|
+
- idna=3.7=py311h06a4308_0
|
42
|
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
43
|
+
- jinja2=3.1.4=py311h06a4308_1
|
44
|
+
- jpeg=9e=h5eee18b_3
|
45
|
+
- kernel-headers_linux-64=3.10.0=h57e8cba_10
|
99
46
|
- lame=3.100=h7b6447c_0
|
100
|
-
- lcms2=2.
|
101
|
-
- ld_impl_linux-64=2.
|
102
|
-
- lerc=
|
103
|
-
-
|
104
|
-
-
|
105
|
-
-
|
106
|
-
-
|
107
|
-
-
|
108
|
-
-
|
109
|
-
-
|
110
|
-
-
|
111
|
-
-
|
112
|
-
-
|
113
|
-
- libcurl=7.85.0=h91b91d3_0
|
114
|
-
- libcusolver=11.4.1.48=0
|
115
|
-
- libcusolver-dev=11.4.1.48=0
|
116
|
-
- libcusparse=11.7.5.86=0
|
117
|
-
- libcusparse-dev=11.7.5.86=0
|
118
|
-
- libdeflate=1.8=h7f8727e_5
|
119
|
-
- libedit=3.1.20191231=he28a2e2_2
|
120
|
-
- libev=4.33=h516909a_1
|
121
|
-
- libevent=2.1.10=h9b69904_4
|
122
|
-
- libffi=3.3=he6710b0_2
|
47
|
+
- lcms2=2.16=hb9589c4_0
|
48
|
+
- ld_impl_linux-64=2.40=h12ee557_0
|
49
|
+
- lerc=4.0.0=h6a678d5_0
|
50
|
+
- libabseil=20240116.2=cxx17_h6a678d5_0
|
51
|
+
- libcublas=12.4.5.8=0
|
52
|
+
- libcufft=11.2.1.3=0
|
53
|
+
- libcufile=1.9.1.3=0
|
54
|
+
- libcurand=10.3.5.147=0
|
55
|
+
- libcusolver=11.6.1.9=0
|
56
|
+
- libcusparse=12.3.1.170=0
|
57
|
+
- libdeflate=1.22=h5eee18b_0
|
58
|
+
- libffi=3.4.4=h6a678d5_1
|
59
|
+
- libgcc-devel_linux-64=11.2.0=h1234567_1
|
123
60
|
- libgcc-ng=11.2.0=h1234567_1
|
124
|
-
- libgfortran-ng=12.2.0=h69a702a_19
|
125
|
-
- libgfortran5=12.2.0=h337968e_19
|
126
61
|
- libgomp=11.2.0=h1234567_1
|
127
|
-
- libiconv=1.16=
|
128
|
-
- libidn2=2.3.
|
129
|
-
-
|
130
|
-
- libnpp=
|
131
|
-
-
|
132
|
-
-
|
133
|
-
- libnvjpeg
|
134
|
-
- libpng=1.6.
|
135
|
-
- libprotobuf=
|
136
|
-
-
|
62
|
+
- libiconv=1.16=h5eee18b_3
|
63
|
+
- libidn2=2.3.4=h5eee18b_0
|
64
|
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
65
|
+
- libnpp=12.2.5.30=0
|
66
|
+
- libnvfatbin=12.4.127=0
|
67
|
+
- libnvjitlink=12.4.127=0
|
68
|
+
- libnvjpeg=12.3.1.117=0
|
69
|
+
- libpng=1.6.39=h5eee18b_0
|
70
|
+
- libprotobuf=4.25.3=he621ea3_0
|
71
|
+
- libstdcxx-devel_linux-64=11.2.0=h1234567_1
|
137
72
|
- libstdcxx-ng=11.2.0=h1234567_1
|
138
|
-
- libtasn1=4.
|
139
|
-
-
|
140
|
-
- libtiff=4.4.0=hecacb30_0
|
73
|
+
- libtasn1=4.19.0=h5eee18b_0
|
74
|
+
- libtiff=4.5.1=hffd6297_1
|
141
75
|
- libunistring=0.9.10=h27cfd23_0
|
142
|
-
- libuuid=1.
|
143
|
-
- libwebp=1.2
|
144
|
-
- libwebp-base=1.2
|
145
|
-
-
|
146
|
-
-
|
147
|
-
- markupsafe=2.1.
|
148
|
-
- mkl=
|
149
|
-
- mkl-service=2.4.0=
|
150
|
-
- mkl_fft=1.3.
|
151
|
-
- mkl_random=1.2.
|
152
|
-
-
|
153
|
-
-
|
154
|
-
-
|
155
|
-
- ncurses=6.
|
76
|
+
- libuuid=1.41.5=h5eee18b_0
|
77
|
+
- libwebp=1.3.2=h11a3e52_0
|
78
|
+
- libwebp-base=1.3.2=h5eee18b_1
|
79
|
+
- llvm-openmp=14.0.6=h9e868ea_0
|
80
|
+
- lz4-c=1.9.4=h6a678d5_1
|
81
|
+
- markupsafe=2.1.3=py311h5eee18b_0
|
82
|
+
- mkl=2023.1.0=h213fc3f_46344
|
83
|
+
- mkl-service=2.4.0=py311h5eee18b_1
|
84
|
+
- mkl_fft=1.3.11=py311h5eee18b_0
|
85
|
+
- mkl_random=1.2.8=py311ha02d727_0
|
86
|
+
- mpc=1.1.0=h10f8cd9_1
|
87
|
+
- mpfr=4.0.2=hb69a4c5_1
|
88
|
+
- mpmath=1.3.0=py311h06a4308_0
|
89
|
+
- ncurses=6.4=h6a678d5_0
|
156
90
|
- nettle=3.7.3=hbbd107a_1
|
157
|
-
-
|
158
|
-
-
|
159
|
-
- numpy=
|
160
|
-
#- numpy-base=1.23.3=py310h8e6c178_0
|
161
|
-
- oauthlib=3.2.2=pyhd8ed1ab_0
|
91
|
+
- networkx=3.2.1=py311h06a4308_0
|
92
|
+
- numpy=2.0.1=py311h08b1b3b_1
|
93
|
+
- numpy-base=2.0.1=py311hf175353_1
|
162
94
|
- openh264=2.1.1=h4ff587b_0
|
163
|
-
-
|
164
|
-
-
|
165
|
-
-
|
166
|
-
-
|
167
|
-
-
|
168
|
-
-
|
169
|
-
-
|
170
|
-
-
|
171
|
-
- pyasn1=0.4.8=py_0
|
172
|
-
- pyasn1-modules=0.2.7=py_0
|
173
|
-
- pycparser=2.21=pyhd3eb1b0_0
|
174
|
-
- pyjwt=2.6.0=pyhd8ed1ab_0
|
175
|
-
- pyopenssl=22.0.0=pyhd3eb1b0_0
|
176
|
-
- pyparsing=3.0.9=pyhd8ed1ab_0
|
177
|
-
- pysocks=1.7.1=py310h06a4308_0
|
178
|
-
- python=3.10.6=haa1d7c7_1
|
179
|
-
- python-dateutil=2.8.2=pyhd8ed1ab_0
|
180
|
-
- python-xxhash=3.0.0=py310h5764c6d_1
|
181
|
-
- python_abi=3.10=2_cp310
|
182
|
-
- pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
|
183
|
-
- pytorch-cuda=11.7=h67b0de4_0
|
95
|
+
- openjpeg=2.5.2=he7f1fd0_0
|
96
|
+
- openssl=3.0.15=h5eee18b_0
|
97
|
+
- pillow=11.0.0=py311hcea889d_1
|
98
|
+
- pip=24.2=py311h06a4308_0
|
99
|
+
- pysocks=1.7.1=py311h06a4308_0
|
100
|
+
- python=3.11.11=he870216_0
|
101
|
+
- pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
|
102
|
+
- pytorch-cuda=12.4=hc786d27_7
|
184
103
|
- pytorch-mutex=1.0=cuda
|
185
|
-
-
|
186
|
-
- pyu2f=0.1.5=pyhd8ed1ab_0
|
187
|
-
- pyyaml=6.0=py310h5764c6d_4
|
188
|
-
- re2=2022.04.01=h27087fc_0
|
104
|
+
- pyyaml=6.0.2=py311h5eee18b_0
|
189
105
|
- readline=8.2=h5eee18b_0
|
190
|
-
-
|
191
|
-
-
|
192
|
-
-
|
193
|
-
-
|
194
|
-
-
|
195
|
-
-
|
196
|
-
-
|
197
|
-
-
|
198
|
-
-
|
199
|
-
-
|
200
|
-
-
|
201
|
-
-
|
202
|
-
-
|
203
|
-
-
|
204
|
-
-
|
205
|
-
-
|
206
|
-
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
106
|
+
- requests=2.32.3=py311h06a4308_1
|
107
|
+
- setuptools=75.1.0=py311h06a4308_0
|
108
|
+
- sqlite=3.45.3=h5eee18b_0
|
109
|
+
- sysroot_linux-64=2.17=h57e8cba_10
|
110
|
+
- tbb=2021.8.0=hdb19cb5_0
|
111
|
+
- tk=8.6.14=h39e8969_0
|
112
|
+
- torchaudio=2.5.1=py311_cu124
|
113
|
+
- torchtriton=3.1.0=py311
|
114
|
+
- torchvision=0.20.1=py311_cu124
|
115
|
+
- typing_extensions=4.12.2=py311h06a4308_0
|
116
|
+
- urllib3=2.2.3=py311h06a4308_0
|
117
|
+
- wheel=0.44.0=py311h06a4308_0
|
118
|
+
- xz=5.4.6=h5eee18b_1
|
119
|
+
- yaml=0.2.5=h7b6447c_0
|
120
|
+
- zlib=1.2.13=h5eee18b_1
|
121
|
+
- zstd=1.5.6=hc292b87_0
|
122
|
+
- pip:
|
123
|
+
- absl-py==2.1.0
|
124
|
+
- accelerate==1.2.1
|
125
|
+
- aiohappyeyeballs==2.4.4
|
126
|
+
- aiohttp==3.11.11
|
127
|
+
- aiosignal==1.3.2
|
128
|
+
- annotated-types==0.7.0
|
129
|
+
- attrs==24.3.0
|
130
|
+
- datasets==3.2.0
|
131
|
+
- deepspeed==0.16.2
|
132
|
+
- dill==0.3.8
|
133
|
+
- einops==0.8.0
|
134
|
+
- flash-attn==2.7.2.post1
|
135
|
+
- frozenlist==1.5.0
|
136
|
+
- grpcio==1.70.0
|
137
|
+
- hjson==3.1.0
|
138
|
+
- huggingface-hub==0.27.0
|
139
|
+
- joblib==1.4.2
|
140
|
+
- markdown==3.7
|
141
|
+
- markdown-it-py==3.0.0
|
142
|
+
- mdurl==0.1.2
|
143
|
+
- mpi4py==4.0.1
|
144
|
+
- msgpack==1.1.0
|
145
|
+
- multidict==6.1.0
|
146
|
+
- multiprocess==0.70.16
|
147
|
+
- natsort==8.4.0
|
148
|
+
- ninja==1.11.1.3
|
149
|
+
- nvidia-ml-py==12.560.30
|
150
|
+
- packaging==24.2
|
151
|
+
- pandas==2.2.3
|
152
|
+
- peft==0.14.0
|
153
|
+
- propcache==0.2.1
|
154
|
+
- protobuf==6.30.0
|
155
|
+
- psutil==6.1.1
|
156
|
+
- py-cpuinfo==9.0.0
|
157
|
+
- pyarrow==18.1.0
|
158
|
+
- pydantic==2.10.4
|
159
|
+
- pydantic-core==2.27.2
|
160
|
+
- pygments==2.18.0
|
161
|
+
- python-dateutil==2.9.0.post0
|
162
|
+
- pytz==2024.2
|
163
|
+
- regex==2024.11.6
|
164
|
+
- rich==13.9.4
|
165
|
+
- safetensors==0.4.5
|
166
|
+
- scikit-learn==1.6.1
|
167
|
+
- scipy==1.15.2
|
168
|
+
- seqeval==1.2.2
|
169
|
+
- six==1.17.0
|
170
|
+
- sympy==1.13.1
|
171
|
+
- tensorboard==2.19.0
|
172
|
+
- tensorboard-data-server==0.7.2
|
173
|
+
- threadpoolctl==3.5.0
|
174
|
+
- tokenizers==0.21.0
|
175
|
+
- tqdm==4.67.1
|
176
|
+
- transformers==4.47.1
|
177
|
+
- trl==0.12.0
|
178
|
+
- tzdata==2024.2
|
179
|
+
- werkzeug==3.1.3
|
180
|
+
- xxhash==3.5.0
|
181
|
+
- yarl==1.18.3
|
182
|
+
|
sinatools/ner/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from argparse import Namespace
|
|
11
11
|
tagger = None
|
12
12
|
tag_vocab = None
|
13
13
|
train_config = None
|
14
|
-
|
14
|
+
print("ner started")
|
15
15
|
filename = 'Wj27012000.tar'
|
16
16
|
path =downloader.get_appdatadir()
|
17
17
|
model_path = os.path.join(path, filename)
|
@@ -20,19 +20,21 @@ _path = os.path.join(model_path, "tag_vocab.pkl")
|
|
20
20
|
|
21
21
|
with open(_path, "rb") as fh:
|
22
22
|
tag_vocab = pickle.load(fh)
|
23
|
+
print("tag_vocab loaded")
|
23
24
|
|
24
25
|
train_config = Namespace()
|
25
26
|
args_path = os.path.join(model_path, "args.json")
|
26
|
-
|
27
|
+
print("args loaded")
|
27
28
|
with open(args_path, "r") as fh:
|
28
29
|
train_config.__dict__ = json.load(fh)
|
29
|
-
|
30
|
+
print("steps 1")
|
30
31
|
model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
|
31
32
|
model = torch.nn.DataParallel(model)
|
32
|
-
|
33
|
+
print("steps 2")
|
33
34
|
if torch.cuda.is_available():
|
34
35
|
model = model.cuda()
|
35
|
-
|
36
|
+
print("steps 3")
|
36
37
|
train_config.trainer_config["kwargs"]["model"] = model
|
37
38
|
tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
|
38
39
|
tagger.load(os.path.join(model_path,"checkpoints"))
|
40
|
+
print("steps 4")
|
sinatools/ner/data/datasets.py
CHANGED
@@ -37,7 +37,11 @@ class Token:
|
|
37
37
|
:return: str
|
38
38
|
"""
|
39
39
|
gold_tags = "|".join(self.gold_tag)
|
40
|
-
|
40
|
+
|
41
|
+
if self.pred_tag:
|
42
|
+
pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
|
43
|
+
else:
|
44
|
+
pred_tags = ""
|
41
45
|
|
42
46
|
if self.gold_tag:
|
43
47
|
r = f"{self.text}\t{gold_tags}\t{pred_tags}"
|
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
|
|
139
143
|
masks = torch.cat(masks)
|
140
144
|
|
141
145
|
# Pad the tags, do the padding for each tag type
|
142
|
-
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["
|
146
|
+
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
|
143
147
|
for tag, vocab in zip(tags, self.vocab.tags[1:])]
|
144
148
|
tags = torch.cat(tags)
|
145
149
|
|
146
|
-
return subwords, tags, tokens, masks, valid_len
|
150
|
+
return subwords, tags, tokens, masks, valid_len
|
sinatools/ner/data_format.py
CHANGED
@@ -1,16 +1,30 @@
|
|
1
1
|
from torch.utils.data import DataLoader
|
2
|
-
from torchtext.vocab import vocab
|
3
2
|
from collections import Counter, namedtuple
|
4
3
|
import logging
|
5
4
|
import re
|
6
5
|
import itertools
|
7
6
|
from sinatools.ner.helpers import load_object
|
8
|
-
from sinatools.ner.datasets import Token
|
9
|
-
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
7
|
+
from sinatools.ner.data.datasets import Token
|
10
8
|
|
11
9
|
logger = logging.getLogger(__name__)
|
12
10
|
|
13
11
|
|
12
|
+
class Vocab:
|
13
|
+
def __init__(self, counter, specials=[]) -> None:
|
14
|
+
self.itos = list(counter.keys()) + specials
|
15
|
+
self.stoi = {s: i for i, s in enumerate(self.itos)}
|
16
|
+
self.word_count = counter
|
17
|
+
|
18
|
+
def get_itos(self) -> list[str]:
|
19
|
+
return self.itos
|
20
|
+
|
21
|
+
def get_stoi(self) -> dict[str, int]:
|
22
|
+
return self.stoi
|
23
|
+
|
24
|
+
def __len__(self):
|
25
|
+
return len(self.itos)
|
26
|
+
|
27
|
+
|
14
28
|
def conll_to_segments(filename):
|
15
29
|
"""
|
16
30
|
Convert CoNLL files to segments. This return list of segments and each segment is
|
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
|
|
60
74
|
|
61
75
|
# Generate vocabs for tags and tokens
|
62
76
|
tag_vocabs = tag_vocab_by_type(tags)
|
63
|
-
tag_vocabs.insert(0,
|
64
|
-
vocabs = vocabs(tokens=
|
77
|
+
tag_vocabs.insert(0, Vocab(Counter(tags)))
|
78
|
+
vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
|
65
79
|
return tuple(datasets), vocabs
|
66
80
|
|
67
81
|
|
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
|
|
72
86
|
tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
|
73
87
|
|
74
88
|
for tag_type in tag_types:
|
75
|
-
r = re.compile(".*-" + tag_type)
|
89
|
+
r = re.compile(".*-" + tag_type + "$")
|
76
90
|
t = list(filter(r.match, tags)) + ["O"]
|
77
|
-
vocabs.append(
|
91
|
+
vocabs.append(Vocab(Counter(t)))
|
78
92
|
|
79
93
|
return vocabs
|
80
94
|
|
@@ -83,13 +97,11 @@ def text2segments(text):
|
|
83
97
|
"""
|
84
98
|
Convert text to a datasets and index the tokens
|
85
99
|
"""
|
86
|
-
|
87
|
-
list_of_tokens = simple_word_tokenize(text)
|
88
|
-
dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
|
100
|
+
dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
|
89
101
|
tokens = [token.text for segment in dataset for token in segment]
|
90
102
|
|
91
103
|
# Generate vocabs for the tokens
|
92
|
-
segment_vocab =
|
104
|
+
segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
|
93
105
|
return dataset, segment_vocab
|
94
106
|
|
95
107
|
|
@@ -121,4 +133,4 @@ def get_dataloaders(
|
|
121
133
|
logger.info("%s batches found", len(dataloader))
|
122
134
|
dataloaders.append(dataloader)
|
123
135
|
|
124
|
-
return dataloaders
|
136
|
+
return dataloaders
|
sinatools/ner/helpers.py
CHANGED
@@ -4,8 +4,11 @@ import logging
|
|
4
4
|
import importlib
|
5
5
|
import shutil
|
6
6
|
import torch
|
7
|
+
import pickle
|
8
|
+
import json
|
7
9
|
import random
|
8
10
|
import numpy as np
|
11
|
+
from argparse import Namespace
|
9
12
|
|
10
13
|
|
11
14
|
def logging_config(log_file=None):
|
@@ -30,24 +33,17 @@ def logging_config(log_file=None):
|
|
30
33
|
|
31
34
|
|
32
35
|
def load_object(name, kwargs):
|
36
|
+
"""
|
37
|
+
Load objects dynamically given the object name and its arguments
|
38
|
+
:param name: str - object name, class name or function name
|
39
|
+
:param kwargs: dict - keyword arguments
|
40
|
+
:return: object
|
41
|
+
"""
|
42
|
+
object_module, object_name = name.rsplit(".", 1)
|
43
|
+
object_module = importlib.import_module(object_module)
|
44
|
+
fn = getattr(object_module, object_name)(**kwargs)
|
45
|
+
return fn
|
33
46
|
|
34
|
-
try:
|
35
|
-
object_module, object_name = name.rsplit(".", 1)
|
36
|
-
object_module = importlib.import_module(object_module)
|
37
|
-
obj = getattr(object_module, object_name)
|
38
|
-
if callable(obj):
|
39
|
-
fn = obj(**kwargs)
|
40
|
-
return fn
|
41
|
-
else:
|
42
|
-
raise TypeError(f"{name} is not a callable object.")
|
43
|
-
except (ImportError, ModuleNotFoundError) as e:
|
44
|
-
print(f"Error importing module: {e}")
|
45
|
-
except AttributeError as e:
|
46
|
-
print(f"Attribute error: {e}")
|
47
|
-
except Exception as e:
|
48
|
-
print(f"An error occurred: {e}")
|
49
|
-
|
50
|
-
return None
|
51
47
|
|
52
48
|
def make_output_dirs(path, subdirs=[], overwrite=True):
|
53
49
|
"""
|
@@ -66,6 +62,41 @@ def make_output_dirs(path, subdirs=[], overwrite=True):
|
|
66
62
|
os.makedirs(os.path.join(path, subdir))
|
67
63
|
|
68
64
|
|
65
|
+
def load_checkpoint(model_path):
    """
    Load a trained tagger with its tag vocabulary and training configuration

    Reads "tag_vocab.pkl" and "args.json" from the model directory, rebuilds
    the loss, network and trainer objects named in the configuration, and
    then restores the trainer from the "checkpoints" sub-directory.

    :param model_path: str - path to the model directory
    :return: tagger - trainer object built from train_config.trainer_config,
                      with the checkpoint loaded
             tag_vocab - object unpickled from "tag_vocab.pkl" (indexed tags)
             train_config - argparse.Namespace - training configurations
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling. Only use model directories that come from a trusted source.
    with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
        tag_vocab = pickle.load(fh)

    # Load train configurations from checkpoint. JSON text is UTF-8, so do
    # not depend on the platform default encoding when reading it.
    train_config = Namespace()
    with open(os.path.join(model_path, "args.json"), "r", encoding="utf-8") as fh:
        vars(train_config).update(json.load(fh))

    # Initialize the loss function, not used for inference, but evaluation
    loss_config = train_config.loss
    loss = load_object(loss_config["fn"], loss_config["kwargs"])

    # Build the network named in the configuration and wrap it in DataParallel
    network_config = train_config.network_config
    model = torch.nn.DataParallel(
        load_object(network_config["fn"], network_config["kwargs"])
    )
    if torch.cuda.is_available():
        model = model.cuda()

    # Update arguments for the tagger:
    # attach the model and the loss (used for evaluation cases)
    trainer_config = train_config.trainer_config
    trainer_config["kwargs"]["model"] = model
    trainer_config["kwargs"]["loss"] = loss

    tagger = load_object(trainer_config["fn"], trainer_config["kwargs"])
    tagger.load(os.path.join(model_path, "checkpoints"))
    return tagger, tag_vocab, train_config
|
99
|
+
|
69
100
|
|
70
101
|
def set_seed(seed):
|
71
102
|
"""
|
@@ -83,4 +114,4 @@ def set_seed(seed):
|
|
83
114
|
|
84
115
|
torch.backends.cudnn.deterministic = True
|
85
116
|
torch.backends.cudnn.benchmark = False
|
86
|
-
torch.backends.cudnn.enabled = False
|
117
|
+
torch.backends.cudnn.enabled = False
|
@@ -113,5 +113,5 @@ class BaseTrainer:
|
|
113
113
|
logger.info("Loading checkpoint %s", checkpoint_path)
|
114
114
|
|
115
115
|
device = None if torch.cuda.is_available() else torch.device('cpu')
|
116
|
-
checkpoint = torch.load(checkpoint_path, map_location=device)
|
117
|
-
self.model.load_state_dict(checkpoint["model"], strict=False)
|
116
|
+
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
|
117
|
+
self.model.load_state_dict(checkpoint["model"], strict=False)
|
@@ -1,227 +0,0 @@
|
|
1
|
-
name: arabicner
|
2
|
-
channels:
|
3
|
-
- anaconda
|
4
|
-
- pytorch
|
5
|
-
- nvidia
|
6
|
-
- conda-forge
|
7
|
-
- defaults
|
8
|
-
dependencies:
|
9
|
-
- _libgcc_mutex=0.1=main
|
10
|
-
- _openmp_mutex=5.1=1_gnu
|
11
|
-
- abseil-cpp=20211102.0=h27087fc_1
|
12
|
-
- absl-py=1.3.0=pyhd8ed1ab_0
|
13
|
-
- aiohttp=3.8.1=py310h5764c6d_1
|
14
|
-
- aiosignal=1.2.0=pyhd8ed1ab_0
|
15
|
-
- arrow-cpp=8.0.0=py310h3098874_0
|
16
|
-
- async-timeout=4.0.2=py310h06a4308_0
|
17
|
-
- attrs=22.1.0=pyh71513ae_1
|
18
|
-
- aws-c-common=0.4.57=he6710b0_1
|
19
|
-
- aws-c-event-stream=0.1.6=h2531618_5
|
20
|
-
- aws-checksums=0.1.9=he6710b0_0
|
21
|
-
- aws-sdk-cpp=1.8.185=hce553d0_0
|
22
|
-
- blas=1.0=mkl
|
23
|
-
- blinker=1.5=pyhd8ed1ab_0
|
24
|
-
- boost-cpp=1.78.0=he72f1d9_0
|
25
|
-
- bottleneck=1.3.5=py310ha9d4c09_0
|
26
|
-
- brotli=1.0.9=h166bdaf_7
|
27
|
-
- brotli-bin=1.0.9=h166bdaf_7
|
28
|
-
- brotlipy=0.7.0=py310h7f8727e_1002
|
29
|
-
- bzip2=1.0.8=h7b6447c_0
|
30
|
-
- c-ares=1.18.1=h7f98852_0
|
31
|
-
- ca-certificates=2022.9.24=ha878542_0
|
32
|
-
- cachetools=5.2.0=pyhd8ed1ab_0
|
33
|
-
- certifi=2022.9.24=pyhd8ed1ab_0
|
34
|
-
- cffi=1.15.1=py310h74dc2b5_0
|
35
|
-
- charset-normalizer=2.0.4=pyhd3eb1b0_0
|
36
|
-
- click=8.1.3=unix_pyhd8ed1ab_2
|
37
|
-
- cryptography=38.0.1=py310h9ce1e76_0
|
38
|
-
- cuda=11.7.1=0
|
39
|
-
- cuda-cccl=11.7.91=0
|
40
|
-
- cuda-command-line-tools=11.7.1=0
|
41
|
-
- cuda-compiler=11.7.1=0
|
42
|
-
- cuda-cudart=11.7.99=0
|
43
|
-
- cuda-cudart-dev=11.7.99=0
|
44
|
-
- cuda-cuobjdump=11.7.91=0
|
45
|
-
- cuda-cupti=11.7.101=0
|
46
|
-
- cuda-cuxxfilt=11.7.91=0
|
47
|
-
- cuda-demo-suite=11.8.86=0
|
48
|
-
- cuda-documentation=11.8.86=0
|
49
|
-
- cuda-driver-dev=11.7.99=0
|
50
|
-
- cuda-gdb=11.8.86=0
|
51
|
-
- cuda-libraries=11.7.1=0
|
52
|
-
- cuda-libraries-dev=11.7.1=0
|
53
|
-
- cuda-memcheck=11.8.86=0
|
54
|
-
- cuda-nsight=11.8.86=0
|
55
|
-
- cuda-nsight-compute=11.8.0=0
|
56
|
-
- cuda-nvcc=11.7.99=0
|
57
|
-
- cuda-nvdisasm=11.8.86=0
|
58
|
-
- cuda-nvml-dev=11.7.91=0
|
59
|
-
- cuda-nvprof=11.8.87=0
|
60
|
-
- cuda-nvprune=11.7.91=0
|
61
|
-
- cuda-nvrtc=11.7.99=0
|
62
|
-
- cuda-nvrtc-dev=11.7.99=0
|
63
|
-
- cuda-nvtx=11.7.91=0
|
64
|
-
- cuda-nvvp=11.8.87=0
|
65
|
-
- cuda-runtime=11.7.1=0
|
66
|
-
- cuda-sanitizer-api=11.8.86=0
|
67
|
-
- cuda-toolkit=11.7.1=0
|
68
|
-
- cuda-tools=11.7.1=0
|
69
|
-
- cuda-visual-tools=11.7.1=0
|
70
|
-
- dataclasses=0.8=pyhc8e2a94_3
|
71
|
-
- datasets=2.6.1=pyhd8ed1ab_0
|
72
|
-
- dill=0.3.5.1=pyhd8ed1ab_0
|
73
|
-
- ffmpeg=4.3=hf484d3e_0
|
74
|
-
- fftw=3.3.10=nompi_h77c792f_102
|
75
|
-
- filelock=3.8.0=pyhd8ed1ab_0
|
76
|
-
- freetype=2.12.1=h4a9f257_0
|
77
|
-
- frozenlist=1.2.0=py310h7f8727e_1
|
78
|
-
- fsspec=2022.10.0=pyhd8ed1ab_0
|
79
|
-
- gds-tools=1.4.0.31=0
|
80
|
-
- gflags=2.2.2=he1b5a44_1004
|
81
|
-
- giflib=5.2.1=h7b6447c_0
|
82
|
-
- glog=0.6.0=h6f12383_0
|
83
|
-
- gmp=6.2.1=h295c915_3
|
84
|
-
- gnutls=3.6.15=he1e5248_0
|
85
|
-
#- google-auth=2.14.0=pyh1a96a4e_0
|
86
|
-
#- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
|
87
|
-
- grpc-cpp=1.46.1=h33aed49_0
|
88
|
-
- grpcio=1.42.0=py310hce63b2e_0
|
89
|
-
- huggingface_hub=0.10.1=pyhd8ed1ab_0
|
90
|
-
- icu=70.1=h27087fc_0
|
91
|
-
- idna=3.4=py310h06a4308_0
|
92
|
-
- importlib-metadata=5.0.0=pyha770c72_1
|
93
|
-
- importlib_metadata=5.0.0=hd8ed1ab_1
|
94
|
-
- intel-openmp=2021.4.0=h06a4308_3561
|
95
|
-
- joblib=1.2.0=pyhd8ed1ab_0
|
96
|
-
- jpeg=9e=h7f8727e_0
|
97
|
-
- keyutils=1.6.1=h166bdaf_0
|
98
|
-
- krb5=1.19.3=h3790be6_0
|
99
|
-
- lame=3.100=h7b6447c_0
|
100
|
-
- lcms2=2.12=h3be6417_0
|
101
|
-
- ld_impl_linux-64=2.38=h1181459_1
|
102
|
-
- lerc=3.0=h295c915_0
|
103
|
-
- libbrotlicommon=1.0.9=h166bdaf_7
|
104
|
-
- libbrotlidec=1.0.9=h166bdaf_7
|
105
|
-
- libbrotlienc=1.0.9=h166bdaf_7
|
106
|
-
- libcublas=11.11.3.6=0
|
107
|
-
- libcublas-dev=11.11.3.6=0
|
108
|
-
- libcufft=10.9.0.58=0
|
109
|
-
- libcufft-dev=10.9.0.58=0
|
110
|
-
- libcufile=1.4.0.31=0
|
111
|
-
- libcufile-dev=1.4.0.31=0
|
112
|
-
- libcurand=10.3.0.86=0
|
113
|
-
- libcurl=7.85.0=h91b91d3_0
|
114
|
-
- libcusolver=11.4.1.48=0
|
115
|
-
- libcusolver-dev=11.4.1.48=0
|
116
|
-
- libcusparse=11.7.5.86=0
|
117
|
-
- libcusparse-dev=11.7.5.86=0
|
118
|
-
- libdeflate=1.8=h7f8727e_5
|
119
|
-
- libedit=3.1.20191231=he28a2e2_2
|
120
|
-
- libev=4.33=h516909a_1
|
121
|
-
- libevent=2.1.10=h9b69904_4
|
122
|
-
- libffi=3.3=he6710b0_2
|
123
|
-
- libgcc-ng=11.2.0=h1234567_1
|
124
|
-
- libgfortran-ng=12.2.0=h69a702a_19
|
125
|
-
- libgfortran5=12.2.0=h337968e_19
|
126
|
-
- libgomp=11.2.0=h1234567_1
|
127
|
-
- libiconv=1.16=h7f8727e_2
|
128
|
-
- libidn2=2.3.2=h7f8727e_0
|
129
|
-
- libnghttp2=1.46.0=hce63b2e_0
|
130
|
-
- libnpp=11.8.0.86=0
|
131
|
-
- libnpp-dev=11.8.0.86=0
|
132
|
-
- libnvjpeg=11.9.0.86=0
|
133
|
-
- libnvjpeg-dev=11.9.0.86=0
|
134
|
-
- libpng=1.6.37=hbc83047_0
|
135
|
-
- libprotobuf=3.20.1=h4ff587b_0
|
136
|
-
- libssh2=1.10.0=ha56f1ee_2
|
137
|
-
- libstdcxx-ng=11.2.0=h1234567_1
|
138
|
-
- libtasn1=4.16.0=h27cfd23_0
|
139
|
-
- libthrift=0.15.0=he6d91bd_0
|
140
|
-
- libtiff=4.4.0=hecacb30_0
|
141
|
-
- libunistring=0.9.10=h27cfd23_0
|
142
|
-
- libuuid=1.0.3=h7f8727e_2
|
143
|
-
- libwebp=1.2.4=h11a3e52_0
|
144
|
-
- libwebp-base=1.2.4=h5eee18b_0
|
145
|
-
- lz4-c=1.9.3=h295c915_1
|
146
|
-
- markdown=3.4.1=pyhd8ed1ab_0
|
147
|
-
- markupsafe=2.1.1=py310h5764c6d_1
|
148
|
-
- mkl=2021.4.0=h06a4308_640
|
149
|
-
- mkl-service=2.4.0=py310h7f8727e_0
|
150
|
-
- mkl_fft=1.3.1=py310hd6ae3a3_0
|
151
|
-
- mkl_random=1.2.2=py310h00e6091_0
|
152
|
-
- multidict=6.0.2=py310h5764c6d_1
|
153
|
-
- multiprocess=0.70.12.2=py310h5764c6d_2
|
154
|
-
- natsort=7.1.1=pyhd3eb1b0_0
|
155
|
-
- ncurses=6.3=h5eee18b_3
|
156
|
-
- nettle=3.7.3=hbbd107a_1
|
157
|
-
- nsight-compute=2022.3.0.22=0
|
158
|
-
- numexpr=2.8.3=py310hcea2de6_0
|
159
|
-
- numpy=1.23.3=py310hd5efca6_0
|
160
|
-
#- numpy-base=1.23.3=py310h8e6c178_0
|
161
|
-
- oauthlib=3.2.2=pyhd8ed1ab_0
|
162
|
-
- openh264=2.1.1=h4ff587b_0
|
163
|
-
- openssl=1.1.1s=h7f8727e_0
|
164
|
-
- orc=1.7.4=h07ed6aa_0
|
165
|
-
- packaging=21.3=pyhd8ed1ab_0
|
166
|
-
- pandas=1.4.4=py310h6a678d5_0
|
167
|
-
- pillow=9.2.0=py310hace64e9_1
|
168
|
-
- pip=22.2.2=py310h06a4308_0
|
169
|
-
- protobuf=3.20.1=py310hd8f1fbe_0
|
170
|
-
- pyarrow=8.0.0=py310h468efa6_0
|
171
|
-
- pyasn1=0.4.8=py_0
|
172
|
-
- pyasn1-modules=0.2.7=py_0
|
173
|
-
- pycparser=2.21=pyhd3eb1b0_0
|
174
|
-
- pyjwt=2.6.0=pyhd8ed1ab_0
|
175
|
-
- pyopenssl=22.0.0=pyhd3eb1b0_0
|
176
|
-
- pyparsing=3.0.9=pyhd8ed1ab_0
|
177
|
-
- pysocks=1.7.1=py310h06a4308_0
|
178
|
-
- python=3.10.6=haa1d7c7_1
|
179
|
-
- python-dateutil=2.8.2=pyhd8ed1ab_0
|
180
|
-
- python-xxhash=3.0.0=py310h5764c6d_1
|
181
|
-
- python_abi=3.10=2_cp310
|
182
|
-
- pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
|
183
|
-
- pytorch-cuda=11.7=h67b0de4_0
|
184
|
-
- pytorch-mutex=1.0=cuda
|
185
|
-
- pytz=2022.6=pyhd8ed1ab_0
|
186
|
-
- pyu2f=0.1.5=pyhd8ed1ab_0
|
187
|
-
- pyyaml=6.0=py310h5764c6d_4
|
188
|
-
- re2=2022.04.01=h27087fc_0
|
189
|
-
- readline=8.2=h5eee18b_0
|
190
|
-
- regex=2022.7.9=py310h5eee18b_0
|
191
|
-
- requests=2.28.1=py310h06a4308_0
|
192
|
-
- requests-oauthlib=1.3.1=pyhd8ed1ab_0
|
193
|
-
- responses=0.18.0=pyhd8ed1ab_0
|
194
|
-
- rsa=4.9=pyhd8ed1ab_0
|
195
|
-
- sacremoses=0.0.53=pyhd8ed1ab_0
|
196
|
-
- scikit-learn=1.1.3=py310h6a678d5_0
|
197
|
-
- scipy=1.9.3=py310hd5efca6_0
|
198
|
-
- seqeval=1.2.2=pyhd3deb0d_0
|
199
|
-
- setuptools=65.4.0=py310h06a4308_0
|
200
|
-
- six=1.16.0=pyhd3eb1b0_1
|
201
|
-
- snappy=1.1.9=hbd366e4_1
|
202
|
-
- sqlite=3.39.3=h5082296_0
|
203
|
-
- tensorboard=2.10.1=pyhd8ed1ab_0
|
204
|
-
- tensorboard-data-server=0.6.0=py310h597c629_2
|
205
|
-
- tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
|
206
|
-
- threadpoolctl=3.1.0=pyh8a188c0_0
|
207
|
-
- tk=8.6.12=h1ccaba5_0
|
208
|
-
- tokenizers=0.11.4=py310h3dcd8bd_1
|
209
|
-
- torchaudio=0.13.0=py310_cu117
|
210
|
-
- torchtext=0.14.0=py310
|
211
|
-
- torchvision=0.14.0=py310_cu117
|
212
|
-
- tqdm=4.64.1=py310h06a4308_0
|
213
|
-
- transformers=4.24.0=pyhd8ed1ab_0
|
214
|
-
- typing-extensions=4.3.0=py310h06a4308_0
|
215
|
-
- typing_extensions=4.3.0=py310h06a4308_0
|
216
|
-
- tzdata=2022e=h04d1e81_0
|
217
|
-
- urllib3=1.26.12=py310h06a4308_0
|
218
|
-
- utf8proc=2.6.1=h27cfd23_0
|
219
|
-
- werkzeug=2.2.2=pyhd8ed1ab_0
|
220
|
-
- wheel=0.37.1=pyhd3eb1b0_0
|
221
|
-
- xxhash=0.8.0=h7f98852_3
|
222
|
-
- xz=5.2.6=h5eee18b_0
|
223
|
-
- yaml=0.2.5=h7f98852_2
|
224
|
-
- yarl=1.7.2=py310h5764c6d_2
|
225
|
-
- zipp=3.10.0=pyhd8ed1ab_0
|
226
|
-
- zlib=1.2.13=h5eee18b_0
|
227
|
-
- zstd=1.5.2=ha4553b6_0
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|