SinaTools 0.1.38__py2.py3-none-any.whl → 0.1.40__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,182 @@
1
+ name: dev
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://repo.anaconda.com/pkgs/r
8
+ dependencies:
9
+ - _libgcc_mutex=0.1=main
10
+ - _openmp_mutex=5.1=1_gnu
11
+ - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
12
+ - binutils_impl_linux-64=2.40=h5293946_0
13
+ - binutils_linux-64=2.40.0=hc2dff05_1
14
+ - blas=1.0=mkl
15
+ - brotli-python=1.0.9=py311h6a678d5_8
16
+ - bzip2=1.0.8=h5eee18b_6
17
+ - ca-certificates=2024.11.26=h06a4308_0
18
+ - certifi=2024.12.14=py311h06a4308_0
19
+ - charset-normalizer=3.3.2=pyhd3eb1b0_0
20
+ - cuda-cudart=12.4.127=0
21
+ - cuda-cupti=12.4.127=0
22
+ - cuda-libraries=12.4.1=0
23
+ - cuda-nvrtc=12.4.127=0
24
+ - cuda-nvtx=12.4.127=0
25
+ - cuda-opencl=12.4.127=0
26
+ - cuda-runtime=12.4.1=0
27
+ - cuda-version=11.7=h6a555f7_3
28
+ - cudatoolkit=11.7.0=hd8887f6_10
29
+ - ffmpeg=4.3=hf484d3e_0
30
+ - filelock=3.13.1=py311h06a4308_0
31
+ - freetype=2.12.1=h4a9f257_0
32
+ - fsspec=2024.6.1=py311h06a4308_0
33
+ - gcc_impl_linux-64=11.2.0=h1234567_1
34
+ - gcc_linux-64=11.2.0=h5c386dc_1
35
+ - giflib=5.2.2=h5eee18b_0
36
+ - gmp=6.2.1=h295c915_3
37
+ - gmpy2=2.1.2=py311hc9b5ff0_0
38
+ - gnutls=3.6.15=he1e5248_0
39
+ - gxx_impl_linux-64=11.2.0=h1234567_1
40
+ - gxx_linux-64=11.2.0=hc2dff05_1
41
+ - idna=3.7=py311h06a4308_0
42
+ - intel-openmp=2023.1.0=hdb19cb5_46306
43
+ - jinja2=3.1.4=py311h06a4308_1
44
+ - jpeg=9e=h5eee18b_3
45
+ - kernel-headers_linux-64=3.10.0=h57e8cba_10
46
+ - lame=3.100=h7b6447c_0
47
+ - lcms2=2.16=hb9589c4_0
48
+ - ld_impl_linux-64=2.40=h12ee557_0
49
+ - lerc=4.0.0=h6a678d5_0
50
+ - libabseil=20240116.2=cxx17_h6a678d5_0
51
+ - libcublas=12.4.5.8=0
52
+ - libcufft=11.2.1.3=0
53
+ - libcufile=1.9.1.3=0
54
+ - libcurand=10.3.5.147=0
55
+ - libcusolver=11.6.1.9=0
56
+ - libcusparse=12.3.1.170=0
57
+ - libdeflate=1.22=h5eee18b_0
58
+ - libffi=3.4.4=h6a678d5_1
59
+ - libgcc-devel_linux-64=11.2.0=h1234567_1
60
+ - libgcc-ng=11.2.0=h1234567_1
61
+ - libgomp=11.2.0=h1234567_1
62
+ - libiconv=1.16=h5eee18b_3
63
+ - libidn2=2.3.4=h5eee18b_0
64
+ - libjpeg-turbo=2.0.0=h9bf148f_0
65
+ - libnpp=12.2.5.30=0
66
+ - libnvfatbin=12.4.127=0
67
+ - libnvjitlink=12.4.127=0
68
+ - libnvjpeg=12.3.1.117=0
69
+ - libpng=1.6.39=h5eee18b_0
70
+ - libprotobuf=4.25.3=he621ea3_0
71
+ - libstdcxx-devel_linux-64=11.2.0=h1234567_1
72
+ - libstdcxx-ng=11.2.0=h1234567_1
73
+ - libtasn1=4.19.0=h5eee18b_0
74
+ - libtiff=4.5.1=hffd6297_1
75
+ - libunistring=0.9.10=h27cfd23_0
76
+ - libuuid=1.41.5=h5eee18b_0
77
+ - libwebp=1.3.2=h11a3e52_0
78
+ - libwebp-base=1.3.2=h5eee18b_1
79
+ - llvm-openmp=14.0.6=h9e868ea_0
80
+ - lz4-c=1.9.4=h6a678d5_1
81
+ - markupsafe=2.1.3=py311h5eee18b_0
82
+ - mkl=2023.1.0=h213fc3f_46344
83
+ - mkl-service=2.4.0=py311h5eee18b_1
84
+ - mkl_fft=1.3.11=py311h5eee18b_0
85
+ - mkl_random=1.2.8=py311ha02d727_0
86
+ - mpc=1.1.0=h10f8cd9_1
87
+ - mpfr=4.0.2=hb69a4c5_1
88
+ - mpmath=1.3.0=py311h06a4308_0
89
+ - ncurses=6.4=h6a678d5_0
90
+ - nettle=3.7.3=hbbd107a_1
91
+ - networkx=3.2.1=py311h06a4308_0
92
+ - numpy=2.0.1=py311h08b1b3b_1
93
+ - numpy-base=2.0.1=py311hf175353_1
94
+ - openh264=2.1.1=h4ff587b_0
95
+ - openjpeg=2.5.2=he7f1fd0_0
96
+ - openssl=3.0.15=h5eee18b_0
97
+ - pillow=11.0.0=py311hcea889d_1
98
+ - pip=24.2=py311h06a4308_0
99
+ - pysocks=1.7.1=py311h06a4308_0
100
+ - python=3.11.11=he870216_0
101
+ - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
102
+ - pytorch-cuda=12.4=hc786d27_7
103
+ - pytorch-mutex=1.0=cuda
104
+ - pyyaml=6.0.2=py311h5eee18b_0
105
+ - readline=8.2=h5eee18b_0
106
+ - requests=2.32.3=py311h06a4308_1
107
+ - setuptools=75.1.0=py311h06a4308_0
108
+ - sqlite=3.45.3=h5eee18b_0
109
+ - sysroot_linux-64=2.17=h57e8cba_10
110
+ - tbb=2021.8.0=hdb19cb5_0
111
+ - tk=8.6.14=h39e8969_0
112
+ - torchaudio=2.5.1=py311_cu124
113
+ - torchtriton=3.1.0=py311
114
+ - torchvision=0.20.1=py311_cu124
115
+ - typing_extensions=4.12.2=py311h06a4308_0
116
+ - urllib3=2.2.3=py311h06a4308_0
117
+ - wheel=0.44.0=py311h06a4308_0
118
+ - xz=5.4.6=h5eee18b_1
119
+ - yaml=0.2.5=h7b6447c_0
120
+ - zlib=1.2.13=h5eee18b_1
121
+ - zstd=1.5.6=hc292b87_0
122
+ - pip:
123
+ - absl-py==2.1.0
124
+ - accelerate==1.2.1
125
+ - aiohappyeyeballs==2.4.4
126
+ - aiohttp==3.11.11
127
+ - aiosignal==1.3.2
128
+ - annotated-types==0.7.0
129
+ - attrs==24.3.0
130
+ - datasets==3.2.0
131
+ - deepspeed==0.16.2
132
+ - dill==0.3.8
133
+ - einops==0.8.0
134
+ - flash-attn==2.7.2.post1
135
+ - frozenlist==1.5.0
136
+ - grpcio==1.70.0
137
+ - hjson==3.1.0
138
+ - huggingface-hub==0.27.0
139
+ - joblib==1.4.2
140
+ - markdown==3.7
141
+ - markdown-it-py==3.0.0
142
+ - mdurl==0.1.2
143
+ - mpi4py==4.0.1
144
+ - msgpack==1.1.0
145
+ - multidict==6.1.0
146
+ - multiprocess==0.70.16
147
+ - natsort==8.4.0
148
+ - ninja==1.11.1.3
149
+ - nvidia-ml-py==12.560.30
150
+ - packaging==24.2
151
+ - pandas==2.2.3
152
+ - peft==0.14.0
153
+ - propcache==0.2.1
154
+ - protobuf==6.30.0
155
+ - psutil==6.1.1
156
+ - py-cpuinfo==9.0.0
157
+ - pyarrow==18.1.0
158
+ - pydantic==2.10.4
159
+ - pydantic-core==2.27.2
160
+ - pygments==2.18.0
161
+ - python-dateutil==2.9.0.post0
162
+ - pytz==2024.2
163
+ - regex==2024.11.6
164
+ - rich==13.9.4
165
+ - safetensors==0.4.5
166
+ - scikit-learn==1.6.1
167
+ - scipy==1.15.2
168
+ - seqeval==1.2.2
169
+ - six==1.17.0
170
+ - sympy==1.13.1
171
+ - tensorboard==2.19.0
172
+ - tensorboard-data-server==0.7.2
173
+ - threadpoolctl==3.5.0
174
+ - tokenizers==0.21.0
175
+ - tqdm==4.67.1
176
+ - transformers==4.47.1
177
+ - trl==0.12.0
178
+ - tzdata==2024.2
179
+ - werkzeug==3.1.3
180
+ - xxhash==3.5.0
181
+ - yarl==1.18.3
182
+
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: SinaTools
3
- Version: 0.1.38
3
+ Version: 0.1.40
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
13
13
  Requires-Dist: tqdm
14
14
  Requires-Dist: requests
15
15
  Requires-Dist: pathlib
16
- Requires-Dist: torch ==1.13.0
17
- Requires-Dist: transformers ==4.24.0
18
- Requires-Dist: torchtext ==0.14.0
19
- Requires-Dist: torchvision ==0.14.0
20
- Requires-Dist: seqeval ==1.2.2
21
- Requires-Dist: natsort ==7.1.1
16
+ Requires-Dist: transformers==4.47.1
17
+ Requires-Dist: torchvision==0.20.1
18
+ Requires-Dist: seqeval==1.2.2
19
+ Requires-Dist: natsort==7.1.1
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: license
25
+ Dynamic: requires-dist
26
+ Dynamic: summary
22
27
 
23
28
  SinaTools
24
29
  ======================
@@ -1,7 +1,7 @@
1
- SinaTools-0.1.38.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
- sinatools/VERSION,sha256=IG8zXDtajZ6W0rgxySeHulP0aoaEpnkET2yOuT5wRks,6
1
+ SinaTools-0.1.40.data/data/sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
2
+ sinatools/VERSION,sha256=X8RiPEX_AmrlLAnj8YcBMQpdMfq9ZIPos6V7xEw6f8o,6
3
3
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
4
- sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
4
+ sinatools/environment.yml,sha256=i0UFZc-vwU9ZwnI8hBdz7vi-x22vG-HR8ojWBUAOkno,5422
5
5
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
6
6
  sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
7
7
  sinatools/CLI/DataDownload/download_files.py,sha256=EezvbukR3pZ8s6mGZnzTcjsbo3CBDlC0g6KhJWlYp1w,2686
@@ -76,21 +76,21 @@ sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB
76
76
  sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
77
77
  sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
78
78
  sinatools/morphology/morph_analyzer.py,sha256=JOH2UWKNQWo5UzpWNzP9R1D3B3qLSogIiMp8n0N_56o,7177
79
- sinatools/ner/__init__.py,sha256=59kLMX6UQhF6JpE10RhaDYC3a2_jiWOIVPuejsoflFE,1050
80
- sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
79
+ sinatools/ner/__init__.py,sha256=bBXwAShP9vVvRD1WsmJtSOueboxRGJftYdKu24cMS8E,1181
80
+ sinatools/ner/data_format.py,sha256=VmFshZbEPOsWxsb4tgSkwvbM1k7yCce4kmtPkCiWgwM,4513
81
81
  sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
82
82
  sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
83
- sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
83
+ sinatools/ner/helpers.py,sha256=sX6ezVbuVQxk_xJqZwhUzJVFVuVmFGmei_kd6r3sPHE,3652
84
84
  sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
85
85
  sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
86
86
  sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
87
- sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
87
+ sinatools/ner/data/datasets.py,sha256=_uUlvBAhnTtPwKLj0wIbmB04VCBidfwffxKorLGHq_g,5134
88
88
  sinatools/ner/data/transforms.py,sha256=URMz1dHzkHjgUGAkDOenCWvQThO1ha8XeQVjoLL9RXM,4874
89
89
  sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
90
90
  sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
91
91
  sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
92
92
  sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
93
- sinatools/ner/trainers/BaseTrainer.py,sha256=Ifz4SeTxJwVn1_uWZ3I9KbcSo2hLPN3ojsIYuoKE9wE,4050
93
+ sinatools/ner/trainers/BaseTrainer.py,sha256=Uar8HxtgBXCVhKa85sEN622d9P7JiFBcWfs46uRG4aA,4068
94
94
  sinatools/ner/trainers/BertNestedTrainer.py,sha256=iJOah69tXZsAXBimqP0odEsk8SPX4A355riePzW2BFs,8632
95
95
  sinatools/ner/trainers/BertTrainer.py,sha256=BtttsrHPolmK3eRDqrgVUuv6lVMuImIeskxhi02Q-44,6596
96
96
  sinatools/ner/trainers/__init__.py,sha256=Xnbi_M4KKJRqV7FJe1vklyT0nEW2Q2obxgcWkbR0ZbA,190
@@ -114,10 +114,10 @@ sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
114
114
  sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
115
115
  sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
116
116
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
117
- SinaTools-0.1.38.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
118
- SinaTools-0.1.38.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
119
- SinaTools-0.1.38.dist-info/METADATA,sha256=sMasvTcuV4-3WpBTyGKHkm9nTFfXuZkf4uXTHDh5_I8,3324
120
- SinaTools-0.1.38.dist-info/WHEEL,sha256=DZajD4pwLWue70CAfc7YaxT1wLUciNBvN_TTcvXpltE,110
121
- SinaTools-0.1.38.dist-info/entry_points.txt,sha256=_CsRKM_tSCWV5hefBNUsWf9_6DrJnzFlxeAo1wm5XqY,1302
122
- SinaTools-0.1.38.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
123
- SinaTools-0.1.38.dist-info/RECORD,,
117
+ SinaTools-0.1.40.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
118
+ SinaTools-0.1.40.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
119
+ SinaTools-0.1.40.dist-info/METADATA,sha256=woJwOAHIJuAlctnXXt8oiYGoDDcV42OrjmWOE3Q6ycI,3410
120
+ SinaTools-0.1.40.dist-info/WHEEL,sha256=9Hm2OB-j1QcCUq9Jguht7ayGIIZBRTdOXD1qg9cCgPM,109
121
+ SinaTools-0.1.40.dist-info/entry_points.txt,sha256=_CsRKM_tSCWV5hefBNUsWf9_6DrJnzFlxeAo1wm5XqY,1302
122
+ SinaTools-0.1.40.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
123
+ SinaTools-0.1.40.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.43.0)
2
+ Generator: setuptools (75.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py2-none-any
5
5
  Tag: py3-none-any
sinatools/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.38
1
+ 0.1.40
sinatools/environment.yml CHANGED
@@ -1,227 +1,182 @@
1
- name: arabicner
1
+ name: dev
2
2
  channels:
3
- - anaconda
4
3
  - pytorch
5
4
  - nvidia
6
- - conda-forge
7
5
  - defaults
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://repo.anaconda.com/pkgs/r
8
8
  dependencies:
9
9
  - _libgcc_mutex=0.1=main
10
10
  - _openmp_mutex=5.1=1_gnu
11
- - abseil-cpp=20211102.0=h27087fc_1
12
- - absl-py=1.3.0=pyhd8ed1ab_0
13
- - aiohttp=3.8.1=py310h5764c6d_1
14
- - aiosignal=1.2.0=pyhd8ed1ab_0
15
- - arrow-cpp=8.0.0=py310h3098874_0
16
- - async-timeout=4.0.2=py310h06a4308_0
17
- - attrs=22.1.0=pyh71513ae_1
18
- - aws-c-common=0.4.57=he6710b0_1
19
- - aws-c-event-stream=0.1.6=h2531618_5
20
- - aws-checksums=0.1.9=he6710b0_0
21
- - aws-sdk-cpp=1.8.185=hce553d0_0
11
+ - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
12
+ - binutils_impl_linux-64=2.40=h5293946_0
13
+ - binutils_linux-64=2.40.0=hc2dff05_1
22
14
  - blas=1.0=mkl
23
- - blinker=1.5=pyhd8ed1ab_0
24
- - boost-cpp=1.78.0=he72f1d9_0
25
- - bottleneck=1.3.5=py310ha9d4c09_0
26
- - brotli=1.0.9=h166bdaf_7
27
- - brotli-bin=1.0.9=h166bdaf_7
28
- - brotlipy=0.7.0=py310h7f8727e_1002
29
- - bzip2=1.0.8=h7b6447c_0
30
- - c-ares=1.18.1=h7f98852_0
31
- - ca-certificates=2022.9.24=ha878542_0
32
- - cachetools=5.2.0=pyhd8ed1ab_0
33
- - certifi=2022.9.24=pyhd8ed1ab_0
34
- - cffi=1.15.1=py310h74dc2b5_0
35
- - charset-normalizer=2.0.4=pyhd3eb1b0_0
36
- - click=8.1.3=unix_pyhd8ed1ab_2
37
- - cryptography=38.0.1=py310h9ce1e76_0
38
- - cuda=11.7.1=0
39
- - cuda-cccl=11.7.91=0
40
- - cuda-command-line-tools=11.7.1=0
41
- - cuda-compiler=11.7.1=0
42
- - cuda-cudart=11.7.99=0
43
- - cuda-cudart-dev=11.7.99=0
44
- - cuda-cuobjdump=11.7.91=0
45
- - cuda-cupti=11.7.101=0
46
- - cuda-cuxxfilt=11.7.91=0
47
- - cuda-demo-suite=11.8.86=0
48
- - cuda-documentation=11.8.86=0
49
- - cuda-driver-dev=11.7.99=0
50
- - cuda-gdb=11.8.86=0
51
- - cuda-libraries=11.7.1=0
52
- - cuda-libraries-dev=11.7.1=0
53
- - cuda-memcheck=11.8.86=0
54
- - cuda-nsight=11.8.86=0
55
- - cuda-nsight-compute=11.8.0=0
56
- - cuda-nvcc=11.7.99=0
57
- - cuda-nvdisasm=11.8.86=0
58
- - cuda-nvml-dev=11.7.91=0
59
- - cuda-nvprof=11.8.87=0
60
- - cuda-nvprune=11.7.91=0
61
- - cuda-nvrtc=11.7.99=0
62
- - cuda-nvrtc-dev=11.7.99=0
63
- - cuda-nvtx=11.7.91=0
64
- - cuda-nvvp=11.8.87=0
65
- - cuda-runtime=11.7.1=0
66
- - cuda-sanitizer-api=11.8.86=0
67
- - cuda-toolkit=11.7.1=0
68
- - cuda-tools=11.7.1=0
69
- - cuda-visual-tools=11.7.1=0
70
- - dataclasses=0.8=pyhc8e2a94_3
71
- - datasets=2.6.1=pyhd8ed1ab_0
72
- - dill=0.3.5.1=pyhd8ed1ab_0
15
+ - brotli-python=1.0.9=py311h6a678d5_8
16
+ - bzip2=1.0.8=h5eee18b_6
17
+ - ca-certificates=2024.11.26=h06a4308_0
18
+ - certifi=2024.12.14=py311h06a4308_0
19
+ - charset-normalizer=3.3.2=pyhd3eb1b0_0
20
+ - cuda-cudart=12.4.127=0
21
+ - cuda-cupti=12.4.127=0
22
+ - cuda-libraries=12.4.1=0
23
+ - cuda-nvrtc=12.4.127=0
24
+ - cuda-nvtx=12.4.127=0
25
+ - cuda-opencl=12.4.127=0
26
+ - cuda-runtime=12.4.1=0
27
+ - cuda-version=11.7=h6a555f7_3
28
+ - cudatoolkit=11.7.0=hd8887f6_10
73
29
  - ffmpeg=4.3=hf484d3e_0
74
- - fftw=3.3.10=nompi_h77c792f_102
75
- - filelock=3.8.0=pyhd8ed1ab_0
30
+ - filelock=3.13.1=py311h06a4308_0
76
31
  - freetype=2.12.1=h4a9f257_0
77
- - frozenlist=1.2.0=py310h7f8727e_1
78
- - fsspec=2022.10.0=pyhd8ed1ab_0
79
- - gds-tools=1.4.0.31=0
80
- - gflags=2.2.2=he1b5a44_1004
81
- - giflib=5.2.1=h7b6447c_0
82
- - glog=0.6.0=h6f12383_0
32
+ - fsspec=2024.6.1=py311h06a4308_0
33
+ - gcc_impl_linux-64=11.2.0=h1234567_1
34
+ - gcc_linux-64=11.2.0=h5c386dc_1
35
+ - giflib=5.2.2=h5eee18b_0
83
36
  - gmp=6.2.1=h295c915_3
37
+ - gmpy2=2.1.2=py311hc9b5ff0_0
84
38
  - gnutls=3.6.15=he1e5248_0
85
- #- google-auth=2.14.0=pyh1a96a4e_0
86
- #- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
87
- - grpc-cpp=1.46.1=h33aed49_0
88
- - grpcio=1.42.0=py310hce63b2e_0
89
- - huggingface_hub=0.10.1=pyhd8ed1ab_0
90
- - icu=70.1=h27087fc_0
91
- - idna=3.4=py310h06a4308_0
92
- - importlib-metadata=5.0.0=pyha770c72_1
93
- - importlib_metadata=5.0.0=hd8ed1ab_1
94
- - intel-openmp=2021.4.0=h06a4308_3561
95
- - joblib=1.2.0=pyhd8ed1ab_0
96
- - jpeg=9e=h7f8727e_0
97
- - keyutils=1.6.1=h166bdaf_0
98
- - krb5=1.19.3=h3790be6_0
39
+ - gxx_impl_linux-64=11.2.0=h1234567_1
40
+ - gxx_linux-64=11.2.0=hc2dff05_1
41
+ - idna=3.7=py311h06a4308_0
42
+ - intel-openmp=2023.1.0=hdb19cb5_46306
43
+ - jinja2=3.1.4=py311h06a4308_1
44
+ - jpeg=9e=h5eee18b_3
45
+ - kernel-headers_linux-64=3.10.0=h57e8cba_10
99
46
  - lame=3.100=h7b6447c_0
100
- - lcms2=2.12=h3be6417_0
101
- - ld_impl_linux-64=2.38=h1181459_1
102
- - lerc=3.0=h295c915_0
103
- - libbrotlicommon=1.0.9=h166bdaf_7
104
- - libbrotlidec=1.0.9=h166bdaf_7
105
- - libbrotlienc=1.0.9=h166bdaf_7
106
- - libcublas=11.11.3.6=0
107
- - libcublas-dev=11.11.3.6=0
108
- - libcufft=10.9.0.58=0
109
- - libcufft-dev=10.9.0.58=0
110
- - libcufile=1.4.0.31=0
111
- - libcufile-dev=1.4.0.31=0
112
- - libcurand=10.3.0.86=0
113
- - libcurl=7.85.0=h91b91d3_0
114
- - libcusolver=11.4.1.48=0
115
- - libcusolver-dev=11.4.1.48=0
116
- - libcusparse=11.7.5.86=0
117
- - libcusparse-dev=11.7.5.86=0
118
- - libdeflate=1.8=h7f8727e_5
119
- - libedit=3.1.20191231=he28a2e2_2
120
- - libev=4.33=h516909a_1
121
- - libevent=2.1.10=h9b69904_4
122
- - libffi=3.3=he6710b0_2
47
+ - lcms2=2.16=hb9589c4_0
48
+ - ld_impl_linux-64=2.40=h12ee557_0
49
+ - lerc=4.0.0=h6a678d5_0
50
+ - libabseil=20240116.2=cxx17_h6a678d5_0
51
+ - libcublas=12.4.5.8=0
52
+ - libcufft=11.2.1.3=0
53
+ - libcufile=1.9.1.3=0
54
+ - libcurand=10.3.5.147=0
55
+ - libcusolver=11.6.1.9=0
56
+ - libcusparse=12.3.1.170=0
57
+ - libdeflate=1.22=h5eee18b_0
58
+ - libffi=3.4.4=h6a678d5_1
59
+ - libgcc-devel_linux-64=11.2.0=h1234567_1
123
60
  - libgcc-ng=11.2.0=h1234567_1
124
- - libgfortran-ng=12.2.0=h69a702a_19
125
- - libgfortran5=12.2.0=h337968e_19
126
61
  - libgomp=11.2.0=h1234567_1
127
- - libiconv=1.16=h7f8727e_2
128
- - libidn2=2.3.2=h7f8727e_0
129
- - libnghttp2=1.46.0=hce63b2e_0
130
- - libnpp=11.8.0.86=0
131
- - libnpp-dev=11.8.0.86=0
132
- - libnvjpeg=11.9.0.86=0
133
- - libnvjpeg-dev=11.9.0.86=0
134
- - libpng=1.6.37=hbc83047_0
135
- - libprotobuf=3.20.1=h4ff587b_0
136
- - libssh2=1.10.0=ha56f1ee_2
62
+ - libiconv=1.16=h5eee18b_3
63
+ - libidn2=2.3.4=h5eee18b_0
64
+ - libjpeg-turbo=2.0.0=h9bf148f_0
65
+ - libnpp=12.2.5.30=0
66
+ - libnvfatbin=12.4.127=0
67
+ - libnvjitlink=12.4.127=0
68
+ - libnvjpeg=12.3.1.117=0
69
+ - libpng=1.6.39=h5eee18b_0
70
+ - libprotobuf=4.25.3=he621ea3_0
71
+ - libstdcxx-devel_linux-64=11.2.0=h1234567_1
137
72
  - libstdcxx-ng=11.2.0=h1234567_1
138
- - libtasn1=4.16.0=h27cfd23_0
139
- - libthrift=0.15.0=he6d91bd_0
140
- - libtiff=4.4.0=hecacb30_0
73
+ - libtasn1=4.19.0=h5eee18b_0
74
+ - libtiff=4.5.1=hffd6297_1
141
75
  - libunistring=0.9.10=h27cfd23_0
142
- - libuuid=1.0.3=h7f8727e_2
143
- - libwebp=1.2.4=h11a3e52_0
144
- - libwebp-base=1.2.4=h5eee18b_0
145
- - lz4-c=1.9.3=h295c915_1
146
- - markdown=3.4.1=pyhd8ed1ab_0
147
- - markupsafe=2.1.1=py310h5764c6d_1
148
- - mkl=2021.4.0=h06a4308_640
149
- - mkl-service=2.4.0=py310h7f8727e_0
150
- - mkl_fft=1.3.1=py310hd6ae3a3_0
151
- - mkl_random=1.2.2=py310h00e6091_0
152
- - multidict=6.0.2=py310h5764c6d_1
153
- - multiprocess=0.70.12.2=py310h5764c6d_2
154
- - natsort=7.1.1=pyhd3eb1b0_0
155
- - ncurses=6.3=h5eee18b_3
76
+ - libuuid=1.41.5=h5eee18b_0
77
+ - libwebp=1.3.2=h11a3e52_0
78
+ - libwebp-base=1.3.2=h5eee18b_1
79
+ - llvm-openmp=14.0.6=h9e868ea_0
80
+ - lz4-c=1.9.4=h6a678d5_1
81
+ - markupsafe=2.1.3=py311h5eee18b_0
82
+ - mkl=2023.1.0=h213fc3f_46344
83
+ - mkl-service=2.4.0=py311h5eee18b_1
84
+ - mkl_fft=1.3.11=py311h5eee18b_0
85
+ - mkl_random=1.2.8=py311ha02d727_0
86
+ - mpc=1.1.0=h10f8cd9_1
87
+ - mpfr=4.0.2=hb69a4c5_1
88
+ - mpmath=1.3.0=py311h06a4308_0
89
+ - ncurses=6.4=h6a678d5_0
156
90
  - nettle=3.7.3=hbbd107a_1
157
- - nsight-compute=2022.3.0.22=0
158
- - numexpr=2.8.3=py310hcea2de6_0
159
- - numpy=1.23.3=py310hd5efca6_0
160
- #- numpy-base=1.23.3=py310h8e6c178_0
161
- - oauthlib=3.2.2=pyhd8ed1ab_0
91
+ - networkx=3.2.1=py311h06a4308_0
92
+ - numpy=2.0.1=py311h08b1b3b_1
93
+ - numpy-base=2.0.1=py311hf175353_1
162
94
  - openh264=2.1.1=h4ff587b_0
163
- - openssl=1.1.1s=h7f8727e_0
164
- - orc=1.7.4=h07ed6aa_0
165
- - packaging=21.3=pyhd8ed1ab_0
166
- - pandas=1.4.4=py310h6a678d5_0
167
- - pillow=9.2.0=py310hace64e9_1
168
- - pip=22.2.2=py310h06a4308_0
169
- - protobuf=3.20.1=py310hd8f1fbe_0
170
- - pyarrow=8.0.0=py310h468efa6_0
171
- - pyasn1=0.4.8=py_0
172
- - pyasn1-modules=0.2.7=py_0
173
- - pycparser=2.21=pyhd3eb1b0_0
174
- - pyjwt=2.6.0=pyhd8ed1ab_0
175
- - pyopenssl=22.0.0=pyhd3eb1b0_0
176
- - pyparsing=3.0.9=pyhd8ed1ab_0
177
- - pysocks=1.7.1=py310h06a4308_0
178
- - python=3.10.6=haa1d7c7_1
179
- - python-dateutil=2.8.2=pyhd8ed1ab_0
180
- - python-xxhash=3.0.0=py310h5764c6d_1
181
- - python_abi=3.10=2_cp310
182
- - pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
183
- - pytorch-cuda=11.7=h67b0de4_0
95
+ - openjpeg=2.5.2=he7f1fd0_0
96
+ - openssl=3.0.15=h5eee18b_0
97
+ - pillow=11.0.0=py311hcea889d_1
98
+ - pip=24.2=py311h06a4308_0
99
+ - pysocks=1.7.1=py311h06a4308_0
100
+ - python=3.11.11=he870216_0
101
+ - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
102
+ - pytorch-cuda=12.4=hc786d27_7
184
103
  - pytorch-mutex=1.0=cuda
185
- - pytz=2022.6=pyhd8ed1ab_0
186
- - pyu2f=0.1.5=pyhd8ed1ab_0
187
- - pyyaml=6.0=py310h5764c6d_4
188
- - re2=2022.04.01=h27087fc_0
104
+ - pyyaml=6.0.2=py311h5eee18b_0
189
105
  - readline=8.2=h5eee18b_0
190
- - regex=2022.7.9=py310h5eee18b_0
191
- - requests=2.28.1=py310h06a4308_0
192
- - requests-oauthlib=1.3.1=pyhd8ed1ab_0
193
- - responses=0.18.0=pyhd8ed1ab_0
194
- - rsa=4.9=pyhd8ed1ab_0
195
- - sacremoses=0.0.53=pyhd8ed1ab_0
196
- - scikit-learn=1.1.3=py310h6a678d5_0
197
- - scipy=1.9.3=py310hd5efca6_0
198
- - seqeval=1.2.2=pyhd3deb0d_0
199
- - setuptools=65.4.0=py310h06a4308_0
200
- - six=1.16.0=pyhd3eb1b0_1
201
- - snappy=1.1.9=hbd366e4_1
202
- - sqlite=3.39.3=h5082296_0
203
- - tensorboard=2.10.1=pyhd8ed1ab_0
204
- - tensorboard-data-server=0.6.0=py310h597c629_2
205
- - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
206
- - threadpoolctl=3.1.0=pyh8a188c0_0
207
- - tk=8.6.12=h1ccaba5_0
208
- - tokenizers=0.11.4=py310h3dcd8bd_1
209
- - torchaudio=0.13.0=py310_cu117
210
- - torchtext=0.14.0=py310
211
- - torchvision=0.14.0=py310_cu117
212
- - tqdm=4.64.1=py310h06a4308_0
213
- - transformers=4.24.0=pyhd8ed1ab_0
214
- - typing-extensions=4.3.0=py310h06a4308_0
215
- - typing_extensions=4.3.0=py310h06a4308_0
216
- - tzdata=2022e=h04d1e81_0
217
- - urllib3=1.26.12=py310h06a4308_0
218
- - utf8proc=2.6.1=h27cfd23_0
219
- - werkzeug=2.2.2=pyhd8ed1ab_0
220
- - wheel=0.37.1=pyhd3eb1b0_0
221
- - xxhash=0.8.0=h7f98852_3
222
- - xz=5.2.6=h5eee18b_0
223
- - yaml=0.2.5=h7f98852_2
224
- - yarl=1.7.2=py310h5764c6d_2
225
- - zipp=3.10.0=pyhd8ed1ab_0
226
- - zlib=1.2.13=h5eee18b_0
227
- - zstd=1.5.2=ha4553b6_0
106
+ - requests=2.32.3=py311h06a4308_1
107
+ - setuptools=75.1.0=py311h06a4308_0
108
+ - sqlite=3.45.3=h5eee18b_0
109
+ - sysroot_linux-64=2.17=h57e8cba_10
110
+ - tbb=2021.8.0=hdb19cb5_0
111
+ - tk=8.6.14=h39e8969_0
112
+ - torchaudio=2.5.1=py311_cu124
113
+ - torchtriton=3.1.0=py311
114
+ - torchvision=0.20.1=py311_cu124
115
+ - typing_extensions=4.12.2=py311h06a4308_0
116
+ - urllib3=2.2.3=py311h06a4308_0
117
+ - wheel=0.44.0=py311h06a4308_0
118
+ - xz=5.4.6=h5eee18b_1
119
+ - yaml=0.2.5=h7b6447c_0
120
+ - zlib=1.2.13=h5eee18b_1
121
+ - zstd=1.5.6=hc292b87_0
122
+ - pip:
123
+ - absl-py==2.1.0
124
+ - accelerate==1.2.1
125
+ - aiohappyeyeballs==2.4.4
126
+ - aiohttp==3.11.11
127
+ - aiosignal==1.3.2
128
+ - annotated-types==0.7.0
129
+ - attrs==24.3.0
130
+ - datasets==3.2.0
131
+ - deepspeed==0.16.2
132
+ - dill==0.3.8
133
+ - einops==0.8.0
134
+ - flash-attn==2.7.2.post1
135
+ - frozenlist==1.5.0
136
+ - grpcio==1.70.0
137
+ - hjson==3.1.0
138
+ - huggingface-hub==0.27.0
139
+ - joblib==1.4.2
140
+ - markdown==3.7
141
+ - markdown-it-py==3.0.0
142
+ - mdurl==0.1.2
143
+ - mpi4py==4.0.1
144
+ - msgpack==1.1.0
145
+ - multidict==6.1.0
146
+ - multiprocess==0.70.16
147
+ - natsort==8.4.0
148
+ - ninja==1.11.1.3
149
+ - nvidia-ml-py==12.560.30
150
+ - packaging==24.2
151
+ - pandas==2.2.3
152
+ - peft==0.14.0
153
+ - propcache==0.2.1
154
+ - protobuf==6.30.0
155
+ - psutil==6.1.1
156
+ - py-cpuinfo==9.0.0
157
+ - pyarrow==18.1.0
158
+ - pydantic==2.10.4
159
+ - pydantic-core==2.27.2
160
+ - pygments==2.18.0
161
+ - python-dateutil==2.9.0.post0
162
+ - pytz==2024.2
163
+ - regex==2024.11.6
164
+ - rich==13.9.4
165
+ - safetensors==0.4.5
166
+ - scikit-learn==1.6.1
167
+ - scipy==1.15.2
168
+ - seqeval==1.2.2
169
+ - six==1.17.0
170
+ - sympy==1.13.1
171
+ - tensorboard==2.19.0
172
+ - tensorboard-data-server==0.7.2
173
+ - threadpoolctl==3.5.0
174
+ - tokenizers==0.21.0
175
+ - tqdm==4.67.1
176
+ - transformers==4.47.1
177
+ - trl==0.12.0
178
+ - tzdata==2024.2
179
+ - werkzeug==3.1.3
180
+ - xxhash==3.5.0
181
+ - yarl==1.18.3
182
+
sinatools/ner/__init__.py CHANGED
@@ -11,7 +11,7 @@ from argparse import Namespace
11
11
  tagger = None
12
12
  tag_vocab = None
13
13
  train_config = None
14
-
14
+ print("ner started")
15
15
  filename = 'Wj27012000.tar'
16
16
  path =downloader.get_appdatadir()
17
17
  model_path = os.path.join(path, filename)
@@ -20,19 +20,21 @@ _path = os.path.join(model_path, "tag_vocab.pkl")
20
20
 
21
21
  with open(_path, "rb") as fh:
22
22
  tag_vocab = pickle.load(fh)
23
+ print("tag_vocab loaded")
23
24
 
24
25
  train_config = Namespace()
25
26
  args_path = os.path.join(model_path, "args.json")
26
-
27
+ print("args loaded")
27
28
  with open(args_path, "r") as fh:
28
29
  train_config.__dict__ = json.load(fh)
29
-
30
+ print("steps 1")
30
31
  model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
31
32
  model = torch.nn.DataParallel(model)
32
-
33
+ print("steps 2")
33
34
  if torch.cuda.is_available():
34
35
  model = model.cuda()
35
-
36
+ print("steps 3")
36
37
  train_config.trainer_config["kwargs"]["model"] = model
37
38
  tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38
39
  tagger.load(os.path.join(model_path,"checkpoints"))
40
+ print("steps 4")
@@ -37,7 +37,11 @@ class Token:
37
37
  :return: str
38
38
  """
39
39
  gold_tags = "|".join(self.gold_tag)
40
- pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
40
+
41
+ if self.pred_tag:
42
+ pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
43
+ else:
44
+ pred_tags = ""
41
45
 
42
46
  if self.gold_tag:
43
47
  r = f"{self.text}\t{gold_tags}\t{pred_tags}"
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
139
143
  masks = torch.cat(masks)
140
144
 
141
145
  # Pad the tags, do the padding for each tag type
142
- tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
146
+ tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
143
147
  for tag, vocab in zip(tags, self.vocab.tags[1:])]
144
148
  tags = torch.cat(tags)
145
149
 
146
- return subwords, tags, tokens, masks, valid_len
150
+ return subwords, tags, tokens, masks, valid_len
@@ -1,16 +1,30 @@
1
1
  from torch.utils.data import DataLoader
2
- from torchtext.vocab import vocab
3
2
  from collections import Counter, namedtuple
4
3
  import logging
5
4
  import re
6
5
  import itertools
7
6
  from sinatools.ner.helpers import load_object
8
- from sinatools.ner.datasets import Token
9
- from sinatools.utils.tokenizers_words import simple_word_tokenize
7
+ from sinatools.ner.data.datasets import Token
10
8
 
11
9
  logger = logging.getLogger(__name__)
12
10
 
13
11
 
12
class Vocab:
    """
    Minimal vocabulary that maps tokens to integer indices and back.

    Indices follow the iteration order of ``counter``'s keys; any ``specials``
    are appended after them, in the order given.
    """

    def __init__(self, counter, specials=None) -> None:
        """
        :param counter: mapping of token -> count (e.g. collections.Counter);
                        its keys, in iteration order, become the first indices
        :param specials: list[str] | None - extra tokens (e.g. ["UNK"]) appended
                         after the counter's tokens; None means no extra tokens
        """
        # Use None as the sentinel instead of a mutable [] default, which would
        # be a single list object shared by every call.
        extra_tokens = [] if specials is None else specials

        self.itos = list(counter.keys()) + extra_tokens
        # When a token occurs more than once in itos, the last position wins.
        self.stoi = {s: i for i, s in enumerate(self.itos)}
        self.word_count = counter

    def get_itos(self) -> list[str]:
        """Return the index -> token list."""
        return self.itos

    def get_stoi(self) -> dict[str, int]:
        """Return the token -> index mapping."""
        return self.stoi

    def __len__(self):
        """Return the number of entries in the index -> token list."""
        return len(self.itos)
26
+
27
+
14
28
  def conll_to_segments(filename):
15
29
  """
16
30
  Convert CoNLL files to segments. This return list of segments and each segment is
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
60
74
 
61
75
  # Generate vocabs for tags and tokens
62
76
  tag_vocabs = tag_vocab_by_type(tags)
63
- tag_vocabs.insert(0, vocab(Counter(tags)))
64
- vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
77
+ tag_vocabs.insert(0, Vocab(Counter(tags)))
78
+ vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
65
79
  return tuple(datasets), vocabs
66
80
 
67
81
 
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
72
86
  tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
73
87
 
74
88
  for tag_type in tag_types:
75
- r = re.compile(".*-" + tag_type)
89
+ r = re.compile(".*-" + tag_type + "$")
76
90
  t = list(filter(r.match, tags)) + ["O"]
77
- vocabs.append(vocab(Counter(t), specials=["<pad>"]))
91
+ vocabs.append(Vocab(Counter(t)))
78
92
 
79
93
  return vocabs
80
94
 
@@ -83,13 +97,11 @@ def text2segments(text):
83
97
  """
84
98
  Convert text to a datasets and index the tokens
85
99
  """
86
- #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
87
- list_of_tokens = simple_word_tokenize(text)
88
- dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
100
+ dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
89
101
  tokens = [token.text for segment in dataset for token in segment]
90
102
 
91
103
  # Generate vocabs for the tokens
92
- segment_vocab = vocab(Counter(tokens), specials=["UNK"])
104
+ segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
93
105
  return dataset, segment_vocab
94
106
 
95
107
 
@@ -121,4 +133,4 @@ def get_dataloaders(
121
133
  logger.info("%s batches found", len(dataloader))
122
134
  dataloaders.append(dataloader)
123
135
 
124
- return dataloaders
136
+ return dataloaders
sinatools/ner/helpers.py CHANGED
@@ -4,8 +4,11 @@ import logging
4
4
  import importlib
5
5
  import shutil
6
6
  import torch
7
+ import pickle
8
+ import json
7
9
  import random
8
10
  import numpy as np
11
+ from argparse import Namespace
9
12
 
10
13
 
11
14
  def logging_config(log_file=None):
@@ -30,24 +33,17 @@ def logging_config(log_file=None):
30
33
 
31
34
 
32
35
  def load_object(name, kwargs):
36
+ """
37
+ Load objects dynamically given the object name and its arguments
38
+ :param name: str - object name, class name or function name
39
+ :param kwargs: dict - keyword arguments
40
+ :return: object
41
+ """
42
+ object_module, object_name = name.rsplit(".", 1)
43
+ object_module = importlib.import_module(object_module)
44
+ fn = getattr(object_module, object_name)(**kwargs)
45
+ return fn
33
46
 
34
- try:
35
- object_module, object_name = name.rsplit(".", 1)
36
- object_module = importlib.import_module(object_module)
37
- obj = getattr(object_module, object_name)
38
- if callable(obj):
39
- fn = obj(**kwargs)
40
- return fn
41
- else:
42
- raise TypeError(f"{name} is not a callable object.")
43
- except (ImportError, ModuleNotFoundError) as e:
44
- print(f"Error importing module: {e}")
45
- except AttributeError as e:
46
- print(f"Attribute error: {e}")
47
- except Exception as e:
48
- print(f"An error occurred: {e}")
49
-
50
- return None
51
47
 
52
48
  def make_output_dirs(path, subdirs=[], overwrite=True):
53
49
  """
@@ -66,6 +62,41 @@ def make_output_dirs(path, subdirs=[], overwrite=True):
66
62
  os.makedirs(os.path.join(path, subdir))
67
63
 
68
64
 
65
def load_checkpoint(model_path):
    """
    Load a tagger, its tag vocabulary and its training configuration.

    Reads ``tag_vocab.pkl`` and ``args.json`` from ``model_path``, builds the
    loss, network and trainer objects named in that configuration via
    ``load_object``, and then calls the trainer's ``load`` on the
    ``checkpoints`` sub-directory.

    :param model_path: str - path to the model directory
    :return: tagger - object built from ``train_config.trainer_config``
                      (presumably a BaseTrainer subclass - confirm)
             tag_vocab - object unpickled from ``tag_vocab.pkl``
                         (presumably the indexed tags - confirm)
             train_config - argparse.Namespace - training configurations; its
                            ``trainer_config["kwargs"]`` is updated in place
                            with the instantiated model and loss
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file.
    # Only point model_path at model directories from a trusted source.
    with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
        tag_vocab = pickle.load(fh)

    # Load train configurations from checkpoint. JSON is UTF-8 by
    # specification, so do not depend on the platform default encoding.
    with open(os.path.join(model_path, "args.json"), "r", encoding="utf-8") as fh:
        train_config = Namespace(**json.load(fh))

    # Initialize the loss function, not used for inference, but evaluation
    loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])

    # Build the network described in the config and wrap it in DataParallel
    model = load_object(
        train_config.network_config["fn"],
        train_config.network_config["kwargs"],
    )
    model = torch.nn.DataParallel(model)

    if torch.cuda.is_available():
        model = model.cuda()

    # Update arguments for the tagger:
    # attach the model and the loss (used for evaluation cases)
    trainer_kwargs = train_config.trainer_config["kwargs"]
    trainer_kwargs["model"] = model
    trainer_kwargs["loss"] = loss

    tagger = load_object(train_config.trainer_config["fn"], trainer_kwargs)
    tagger.load(os.path.join(model_path, "checkpoints"))
    return tagger, tag_vocab, train_config
99
+
69
100
 
70
101
  def set_seed(seed):
71
102
  """
@@ -83,4 +114,4 @@ def set_seed(seed):
83
114
 
84
115
  torch.backends.cudnn.deterministic = True
85
116
  torch.backends.cudnn.benchmark = False
86
- torch.backends.cudnn.enabled = False
117
+ torch.backends.cudnn.enabled = False
@@ -113,5 +113,5 @@ class BaseTrainer:
113
113
  logger.info("Loading checkpoint %s", checkpoint_path)
114
114
 
115
115
  device = None if torch.cuda.is_available() else torch.device('cpu')
116
- checkpoint = torch.load(checkpoint_path, map_location=device)
117
- self.model.load_state_dict(checkpoint["model"], strict=False)
116
+ checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
117
+ self.model.load_state_dict(checkpoint["model"], strict=False)
@@ -1,227 +0,0 @@
1
- name: arabicner
2
- channels:
3
- - anaconda
4
- - pytorch
5
- - nvidia
6
- - conda-forge
7
- - defaults
8
- dependencies:
9
- - _libgcc_mutex=0.1=main
10
- - _openmp_mutex=5.1=1_gnu
11
- - abseil-cpp=20211102.0=h27087fc_1
12
- - absl-py=1.3.0=pyhd8ed1ab_0
13
- - aiohttp=3.8.1=py310h5764c6d_1
14
- - aiosignal=1.2.0=pyhd8ed1ab_0
15
- - arrow-cpp=8.0.0=py310h3098874_0
16
- - async-timeout=4.0.2=py310h06a4308_0
17
- - attrs=22.1.0=pyh71513ae_1
18
- - aws-c-common=0.4.57=he6710b0_1
19
- - aws-c-event-stream=0.1.6=h2531618_5
20
- - aws-checksums=0.1.9=he6710b0_0
21
- - aws-sdk-cpp=1.8.185=hce553d0_0
22
- - blas=1.0=mkl
23
- - blinker=1.5=pyhd8ed1ab_0
24
- - boost-cpp=1.78.0=he72f1d9_0
25
- - bottleneck=1.3.5=py310ha9d4c09_0
26
- - brotli=1.0.9=h166bdaf_7
27
- - brotli-bin=1.0.9=h166bdaf_7
28
- - brotlipy=0.7.0=py310h7f8727e_1002
29
- - bzip2=1.0.8=h7b6447c_0
30
- - c-ares=1.18.1=h7f98852_0
31
- - ca-certificates=2022.9.24=ha878542_0
32
- - cachetools=5.2.0=pyhd8ed1ab_0
33
- - certifi=2022.9.24=pyhd8ed1ab_0
34
- - cffi=1.15.1=py310h74dc2b5_0
35
- - charset-normalizer=2.0.4=pyhd3eb1b0_0
36
- - click=8.1.3=unix_pyhd8ed1ab_2
37
- - cryptography=38.0.1=py310h9ce1e76_0
38
- - cuda=11.7.1=0
39
- - cuda-cccl=11.7.91=0
40
- - cuda-command-line-tools=11.7.1=0
41
- - cuda-compiler=11.7.1=0
42
- - cuda-cudart=11.7.99=0
43
- - cuda-cudart-dev=11.7.99=0
44
- - cuda-cuobjdump=11.7.91=0
45
- - cuda-cupti=11.7.101=0
46
- - cuda-cuxxfilt=11.7.91=0
47
- - cuda-demo-suite=11.8.86=0
48
- - cuda-documentation=11.8.86=0
49
- - cuda-driver-dev=11.7.99=0
50
- - cuda-gdb=11.8.86=0
51
- - cuda-libraries=11.7.1=0
52
- - cuda-libraries-dev=11.7.1=0
53
- - cuda-memcheck=11.8.86=0
54
- - cuda-nsight=11.8.86=0
55
- - cuda-nsight-compute=11.8.0=0
56
- - cuda-nvcc=11.7.99=0
57
- - cuda-nvdisasm=11.8.86=0
58
- - cuda-nvml-dev=11.7.91=0
59
- - cuda-nvprof=11.8.87=0
60
- - cuda-nvprune=11.7.91=0
61
- - cuda-nvrtc=11.7.99=0
62
- - cuda-nvrtc-dev=11.7.99=0
63
- - cuda-nvtx=11.7.91=0
64
- - cuda-nvvp=11.8.87=0
65
- - cuda-runtime=11.7.1=0
66
- - cuda-sanitizer-api=11.8.86=0
67
- - cuda-toolkit=11.7.1=0
68
- - cuda-tools=11.7.1=0
69
- - cuda-visual-tools=11.7.1=0
70
- - dataclasses=0.8=pyhc8e2a94_3
71
- - datasets=2.6.1=pyhd8ed1ab_0
72
- - dill=0.3.5.1=pyhd8ed1ab_0
73
- - ffmpeg=4.3=hf484d3e_0
74
- - fftw=3.3.10=nompi_h77c792f_102
75
- - filelock=3.8.0=pyhd8ed1ab_0
76
- - freetype=2.12.1=h4a9f257_0
77
- - frozenlist=1.2.0=py310h7f8727e_1
78
- - fsspec=2022.10.0=pyhd8ed1ab_0
79
- - gds-tools=1.4.0.31=0
80
- - gflags=2.2.2=he1b5a44_1004
81
- - giflib=5.2.1=h7b6447c_0
82
- - glog=0.6.0=h6f12383_0
83
- - gmp=6.2.1=h295c915_3
84
- - gnutls=3.6.15=he1e5248_0
85
- #- google-auth=2.14.0=pyh1a96a4e_0
86
- #- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
87
- - grpc-cpp=1.46.1=h33aed49_0
88
- - grpcio=1.42.0=py310hce63b2e_0
89
- - huggingface_hub=0.10.1=pyhd8ed1ab_0
90
- - icu=70.1=h27087fc_0
91
- - idna=3.4=py310h06a4308_0
92
- - importlib-metadata=5.0.0=pyha770c72_1
93
- - importlib_metadata=5.0.0=hd8ed1ab_1
94
- - intel-openmp=2021.4.0=h06a4308_3561
95
- - joblib=1.2.0=pyhd8ed1ab_0
96
- - jpeg=9e=h7f8727e_0
97
- - keyutils=1.6.1=h166bdaf_0
98
- - krb5=1.19.3=h3790be6_0
99
- - lame=3.100=h7b6447c_0
100
- - lcms2=2.12=h3be6417_0
101
- - ld_impl_linux-64=2.38=h1181459_1
102
- - lerc=3.0=h295c915_0
103
- - libbrotlicommon=1.0.9=h166bdaf_7
104
- - libbrotlidec=1.0.9=h166bdaf_7
105
- - libbrotlienc=1.0.9=h166bdaf_7
106
- - libcublas=11.11.3.6=0
107
- - libcublas-dev=11.11.3.6=0
108
- - libcufft=10.9.0.58=0
109
- - libcufft-dev=10.9.0.58=0
110
- - libcufile=1.4.0.31=0
111
- - libcufile-dev=1.4.0.31=0
112
- - libcurand=10.3.0.86=0
113
- - libcurl=7.85.0=h91b91d3_0
114
- - libcusolver=11.4.1.48=0
115
- - libcusolver-dev=11.4.1.48=0
116
- - libcusparse=11.7.5.86=0
117
- - libcusparse-dev=11.7.5.86=0
118
- - libdeflate=1.8=h7f8727e_5
119
- - libedit=3.1.20191231=he28a2e2_2
120
- - libev=4.33=h516909a_1
121
- - libevent=2.1.10=h9b69904_4
122
- - libffi=3.3=he6710b0_2
123
- - libgcc-ng=11.2.0=h1234567_1
124
- - libgfortran-ng=12.2.0=h69a702a_19
125
- - libgfortran5=12.2.0=h337968e_19
126
- - libgomp=11.2.0=h1234567_1
127
- - libiconv=1.16=h7f8727e_2
128
- - libidn2=2.3.2=h7f8727e_0
129
- - libnghttp2=1.46.0=hce63b2e_0
130
- - libnpp=11.8.0.86=0
131
- - libnpp-dev=11.8.0.86=0
132
- - libnvjpeg=11.9.0.86=0
133
- - libnvjpeg-dev=11.9.0.86=0
134
- - libpng=1.6.37=hbc83047_0
135
- - libprotobuf=3.20.1=h4ff587b_0
136
- - libssh2=1.10.0=ha56f1ee_2
137
- - libstdcxx-ng=11.2.0=h1234567_1
138
- - libtasn1=4.16.0=h27cfd23_0
139
- - libthrift=0.15.0=he6d91bd_0
140
- - libtiff=4.4.0=hecacb30_0
141
- - libunistring=0.9.10=h27cfd23_0
142
- - libuuid=1.0.3=h7f8727e_2
143
- - libwebp=1.2.4=h11a3e52_0
144
- - libwebp-base=1.2.4=h5eee18b_0
145
- - lz4-c=1.9.3=h295c915_1
146
- - markdown=3.4.1=pyhd8ed1ab_0
147
- - markupsafe=2.1.1=py310h5764c6d_1
148
- - mkl=2021.4.0=h06a4308_640
149
- - mkl-service=2.4.0=py310h7f8727e_0
150
- - mkl_fft=1.3.1=py310hd6ae3a3_0
151
- - mkl_random=1.2.2=py310h00e6091_0
152
- - multidict=6.0.2=py310h5764c6d_1
153
- - multiprocess=0.70.12.2=py310h5764c6d_2
154
- - natsort=7.1.1=pyhd3eb1b0_0
155
- - ncurses=6.3=h5eee18b_3
156
- - nettle=3.7.3=hbbd107a_1
157
- - nsight-compute=2022.3.0.22=0
158
- - numexpr=2.8.3=py310hcea2de6_0
159
- - numpy=1.23.3=py310hd5efca6_0
160
- #- numpy-base=1.23.3=py310h8e6c178_0
161
- - oauthlib=3.2.2=pyhd8ed1ab_0
162
- - openh264=2.1.1=h4ff587b_0
163
- - openssl=1.1.1s=h7f8727e_0
164
- - orc=1.7.4=h07ed6aa_0
165
- - packaging=21.3=pyhd8ed1ab_0
166
- - pandas=1.4.4=py310h6a678d5_0
167
- - pillow=9.2.0=py310hace64e9_1
168
- - pip=22.2.2=py310h06a4308_0
169
- - protobuf=3.20.1=py310hd8f1fbe_0
170
- - pyarrow=8.0.0=py310h468efa6_0
171
- - pyasn1=0.4.8=py_0
172
- - pyasn1-modules=0.2.7=py_0
173
- - pycparser=2.21=pyhd3eb1b0_0
174
- - pyjwt=2.6.0=pyhd8ed1ab_0
175
- - pyopenssl=22.0.0=pyhd3eb1b0_0
176
- - pyparsing=3.0.9=pyhd8ed1ab_0
177
- - pysocks=1.7.1=py310h06a4308_0
178
- - python=3.10.6=haa1d7c7_1
179
- - python-dateutil=2.8.2=pyhd8ed1ab_0
180
- - python-xxhash=3.0.0=py310h5764c6d_1
181
- - python_abi=3.10=2_cp310
182
- - pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
183
- - pytorch-cuda=11.7=h67b0de4_0
184
- - pytorch-mutex=1.0=cuda
185
- - pytz=2022.6=pyhd8ed1ab_0
186
- - pyu2f=0.1.5=pyhd8ed1ab_0
187
- - pyyaml=6.0=py310h5764c6d_4
188
- - re2=2022.04.01=h27087fc_0
189
- - readline=8.2=h5eee18b_0
190
- - regex=2022.7.9=py310h5eee18b_0
191
- - requests=2.28.1=py310h06a4308_0
192
- - requests-oauthlib=1.3.1=pyhd8ed1ab_0
193
- - responses=0.18.0=pyhd8ed1ab_0
194
- - rsa=4.9=pyhd8ed1ab_0
195
- - sacremoses=0.0.53=pyhd8ed1ab_0
196
- - scikit-learn=1.1.3=py310h6a678d5_0
197
- - scipy=1.9.3=py310hd5efca6_0
198
- - seqeval=1.2.2=pyhd3deb0d_0
199
- - setuptools=65.4.0=py310h06a4308_0
200
- - six=1.16.0=pyhd3eb1b0_1
201
- - snappy=1.1.9=hbd366e4_1
202
- - sqlite=3.39.3=h5082296_0
203
- - tensorboard=2.10.1=pyhd8ed1ab_0
204
- - tensorboard-data-server=0.6.0=py310h597c629_2
205
- - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
206
- - threadpoolctl=3.1.0=pyh8a188c0_0
207
- - tk=8.6.12=h1ccaba5_0
208
- - tokenizers=0.11.4=py310h3dcd8bd_1
209
- - torchaudio=0.13.0=py310_cu117
210
- - torchtext=0.14.0=py310
211
- - torchvision=0.14.0=py310_cu117
212
- - tqdm=4.64.1=py310h06a4308_0
213
- - transformers=4.24.0=pyhd8ed1ab_0
214
- - typing-extensions=4.3.0=py310h06a4308_0
215
- - typing_extensions=4.3.0=py310h06a4308_0
216
- - tzdata=2022e=h04d1e81_0
217
- - urllib3=1.26.12=py310h06a4308_0
218
- - utf8proc=2.6.1=h27cfd23_0
219
- - werkzeug=2.2.2=pyhd8ed1ab_0
220
- - wheel=0.37.1=pyhd3eb1b0_0
221
- - xxhash=0.8.0=h7f98852_3
222
- - xz=5.2.6=h5eee18b_0
223
- - yaml=0.2.5=h7f98852_2
224
- - yarl=1.7.2=py310h5764c6d_2
225
- - zipp=3.10.0=pyhd8ed1ab_0
226
- - zlib=1.2.13=h5eee18b_0
227
- - zstd=1.5.2=ha4553b6_0