pyxllib 0.3.96__py3-none-any.whl → 0.3.200__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. pyxllib/__init__.py +21 -21
  2. pyxllib/algo/__init__.py +8 -8
  3. pyxllib/algo/disjoint.py +54 -54
  4. pyxllib/algo/geo.py +541 -529
  5. pyxllib/algo/intervals.py +964 -964
  6. pyxllib/algo/matcher.py +389 -311
  7. pyxllib/algo/newbie.py +166 -166
  8. pyxllib/algo/pupil.py +629 -461
  9. pyxllib/algo/shapelylib.py +67 -67
  10. pyxllib/algo/specialist.py +241 -240
  11. pyxllib/algo/stat.py +494 -458
  12. pyxllib/algo/treelib.py +149 -149
  13. pyxllib/algo/unitlib.py +66 -66
  14. {pyxlpr → pyxllib/autogui}/__init__.py +5 -5
  15. pyxllib/autogui/activewin.py +246 -0
  16. pyxllib/autogui/all.py +9 -0
  17. pyxllib/{ext/autogui → autogui}/autogui.py +852 -823
  18. pyxllib/autogui/uiautolib.py +362 -0
  19. pyxllib/{ext/autogui → autogui}/virtualkey.py +102 -102
  20. pyxllib/autogui/wechat.py +827 -0
  21. pyxllib/autogui/wechat_msg.py +421 -0
  22. pyxllib/autogui/wxautolib.py +84 -0
  23. pyxllib/cv/__init__.py +5 -5
  24. pyxllib/cv/expert.py +267 -267
  25. pyxllib/cv/imfile.py +159 -159
  26. pyxllib/cv/imhash.py +39 -39
  27. pyxllib/cv/pupil.py +9 -9
  28. pyxllib/cv/rgbfmt.py +1525 -1525
  29. pyxllib/cv/slidercaptcha.py +137 -0
  30. pyxllib/cv/trackbartools.py +251 -251
  31. pyxllib/cv/xlcvlib.py +1040 -1040
  32. pyxllib/cv/xlpillib.py +423 -423
  33. pyxllib/data/echarts.py +240 -129
  34. pyxllib/data/jsonlib.py +89 -0
  35. pyxllib/data/oss.py +72 -72
  36. pyxllib/data/pglib.py +1127 -643
  37. pyxllib/data/sqlite.py +568 -341
  38. pyxllib/data/sqllib.py +297 -297
  39. pyxllib/ext/JLineViewer.py +505 -492
  40. pyxllib/ext/__init__.py +6 -6
  41. pyxllib/ext/demolib.py +246 -246
  42. pyxllib/ext/drissionlib.py +277 -0
  43. pyxllib/ext/kq5034lib.py +12 -1606
  44. pyxllib/ext/old.py +663 -663
  45. pyxllib/ext/qt.py +449 -449
  46. pyxllib/ext/robustprocfile.py +497 -0
  47. pyxllib/ext/seleniumlib.py +76 -76
  48. pyxllib/ext/tk.py +173 -173
  49. pyxllib/ext/unixlib.py +827 -826
  50. pyxllib/ext/utools.py +351 -338
  51. pyxllib/ext/webhook.py +124 -101
  52. pyxllib/ext/win32lib.py +40 -40
  53. pyxllib/ext/wjxlib.py +88 -0
  54. pyxllib/ext/wpsapi.py +124 -0
  55. pyxllib/ext/xlwork.py +9 -0
  56. pyxllib/ext/yuquelib.py +1105 -173
  57. pyxllib/file/__init__.py +17 -17
  58. pyxllib/file/docxlib.py +761 -761
  59. pyxllib/file/gitlib.py +309 -309
  60. pyxllib/file/libreoffice.py +165 -0
  61. pyxllib/file/movielib.py +148 -139
  62. pyxllib/file/newbie.py +10 -10
  63. pyxllib/file/onenotelib.py +1469 -1469
  64. pyxllib/file/packlib/__init__.py +330 -293
  65. pyxllib/file/packlib/zipfile.py +2441 -2441
  66. pyxllib/file/pdflib.py +426 -426
  67. pyxllib/file/pupil.py +185 -185
  68. pyxllib/file/specialist/__init__.py +685 -685
  69. pyxllib/file/specialist/dirlib.py +799 -799
  70. pyxllib/file/specialist/download.py +193 -186
  71. pyxllib/file/specialist/filelib.py +2829 -2618
  72. pyxllib/file/xlsxlib.py +3131 -2976
  73. pyxllib/file/xlsyncfile.py +341 -0
  74. pyxllib/prog/__init__.py +5 -5
  75. pyxllib/prog/cachetools.py +64 -0
  76. pyxllib/prog/deprecatedlib.py +233 -233
  77. pyxllib/prog/filelock.py +42 -0
  78. pyxllib/prog/ipyexec.py +253 -253
  79. pyxllib/prog/multiprogs.py +940 -0
  80. pyxllib/prog/newbie.py +451 -444
  81. pyxllib/prog/pupil.py +1197 -1128
  82. pyxllib/prog/sitepackages.py +33 -33
  83. pyxllib/prog/specialist/__init__.py +391 -217
  84. pyxllib/prog/specialist/bc.py +203 -200
  85. pyxllib/prog/specialist/browser.py +497 -488
  86. pyxllib/prog/specialist/common.py +347 -347
  87. pyxllib/prog/specialist/datetime.py +199 -131
  88. pyxllib/prog/specialist/tictoc.py +240 -241
  89. pyxllib/prog/specialist/xllog.py +180 -180
  90. pyxllib/prog/xlosenv.py +108 -101
  91. pyxllib/stdlib/__init__.py +17 -17
  92. pyxllib/stdlib/tablepyxl/__init__.py +10 -10
  93. pyxllib/stdlib/tablepyxl/style.py +303 -303
  94. pyxllib/stdlib/tablepyxl/tablepyxl.py +130 -130
  95. pyxllib/text/__init__.py +8 -8
  96. pyxllib/text/ahocorasick.py +39 -39
  97. pyxllib/text/airscript.js +744 -0
  98. pyxllib/text/charclasslib.py +121 -109
  99. pyxllib/text/jiebalib.py +267 -264
  100. pyxllib/text/jinjalib.py +32 -0
  101. pyxllib/text/jsa_ai_prompt.md +271 -0
  102. pyxllib/text/jscode.py +922 -767
  103. pyxllib/text/latex/__init__.py +158 -158
  104. pyxllib/text/levenshtein.py +303 -303
  105. pyxllib/text/nestenv.py +1215 -1215
  106. pyxllib/text/newbie.py +300 -288
  107. pyxllib/text/pupil/__init__.py +8 -8
  108. pyxllib/text/pupil/common.py +1121 -1095
  109. pyxllib/text/pupil/xlalign.py +326 -326
  110. pyxllib/text/pycode.py +47 -47
  111. pyxllib/text/specialist/__init__.py +8 -8
  112. pyxllib/text/specialist/common.py +112 -112
  113. pyxllib/text/specialist/ptag.py +186 -186
  114. pyxllib/text/spellchecker.py +172 -172
  115. pyxllib/text/templates/echart_base.html +11 -0
  116. pyxllib/text/templates/highlight_code.html +17 -0
  117. pyxllib/text/templates/latex_editor.html +103 -0
  118. pyxllib/text/vbacode.py +17 -17
  119. pyxllib/text/xmllib.py +747 -685
  120. pyxllib/xl.py +42 -38
  121. pyxllib/xlcv.py +17 -17
  122. pyxllib-0.3.200.dist-info/METADATA +48 -0
  123. pyxllib-0.3.200.dist-info/RECORD +126 -0
  124. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info}/WHEEL +1 -2
  125. {pyxllib-0.3.96.dist-info → pyxllib-0.3.200.dist-info/licenses}/LICENSE +190 -190
  126. pyxllib/ext/autogui/__init__.py +0 -8
  127. pyxllib-0.3.96.dist-info/METADATA +0 -51
  128. pyxllib-0.3.96.dist-info/RECORD +0 -333
  129. pyxllib-0.3.96.dist-info/top_level.txt +0 -2
  130. pyxlpr/ai/__init__.py +0 -5
  131. pyxlpr/ai/clientlib.py +0 -1281
  132. pyxlpr/ai/specialist.py +0 -286
  133. pyxlpr/ai/torch_app.py +0 -172
  134. pyxlpr/ai/xlpaddle.py +0 -655
  135. pyxlpr/ai/xltorch.py +0 -705
  136. pyxlpr/data/__init__.py +0 -11
  137. pyxlpr/data/coco.py +0 -1325
  138. pyxlpr/data/datacls.py +0 -365
  139. pyxlpr/data/datasets.py +0 -200
  140. pyxlpr/data/gptlib.py +0 -1291
  141. pyxlpr/data/icdar/__init__.py +0 -96
  142. pyxlpr/data/icdar/deteval.py +0 -377
  143. pyxlpr/data/icdar/icdar2013.py +0 -341
  144. pyxlpr/data/icdar/iou.py +0 -340
  145. pyxlpr/data/icdar/rrc_evaluation_funcs_1_1.py +0 -463
  146. pyxlpr/data/imtextline.py +0 -473
  147. pyxlpr/data/labelme.py +0 -866
  148. pyxlpr/data/removeline.py +0 -179
  149. pyxlpr/data/specialist.py +0 -57
  150. pyxlpr/eval/__init__.py +0 -85
  151. pyxlpr/paddleocr.py +0 -776
  152. pyxlpr/ppocr/__init__.py +0 -15
  153. pyxlpr/ppocr/configs/rec/multi_language/generate_multi_language_configs.py +0 -226
  154. pyxlpr/ppocr/data/__init__.py +0 -135
  155. pyxlpr/ppocr/data/imaug/ColorJitter.py +0 -26
  156. pyxlpr/ppocr/data/imaug/__init__.py +0 -67
  157. pyxlpr/ppocr/data/imaug/copy_paste.py +0 -170
  158. pyxlpr/ppocr/data/imaug/east_process.py +0 -437
  159. pyxlpr/ppocr/data/imaug/gen_table_mask.py +0 -244
  160. pyxlpr/ppocr/data/imaug/iaa_augment.py +0 -114
  161. pyxlpr/ppocr/data/imaug/label_ops.py +0 -789
  162. pyxlpr/ppocr/data/imaug/make_border_map.py +0 -184
  163. pyxlpr/ppocr/data/imaug/make_pse_gt.py +0 -106
  164. pyxlpr/ppocr/data/imaug/make_shrink_map.py +0 -126
  165. pyxlpr/ppocr/data/imaug/operators.py +0 -433
  166. pyxlpr/ppocr/data/imaug/pg_process.py +0 -906
  167. pyxlpr/ppocr/data/imaug/randaugment.py +0 -143
  168. pyxlpr/ppocr/data/imaug/random_crop_data.py +0 -239
  169. pyxlpr/ppocr/data/imaug/rec_img_aug.py +0 -533
  170. pyxlpr/ppocr/data/imaug/sast_process.py +0 -777
  171. pyxlpr/ppocr/data/imaug/text_image_aug/__init__.py +0 -17
  172. pyxlpr/ppocr/data/imaug/text_image_aug/augment.py +0 -120
  173. pyxlpr/ppocr/data/imaug/text_image_aug/warp_mls.py +0 -168
  174. pyxlpr/ppocr/data/lmdb_dataset.py +0 -115
  175. pyxlpr/ppocr/data/pgnet_dataset.py +0 -104
  176. pyxlpr/ppocr/data/pubtab_dataset.py +0 -107
  177. pyxlpr/ppocr/data/simple_dataset.py +0 -372
  178. pyxlpr/ppocr/losses/__init__.py +0 -61
  179. pyxlpr/ppocr/losses/ace_loss.py +0 -52
  180. pyxlpr/ppocr/losses/basic_loss.py +0 -135
  181. pyxlpr/ppocr/losses/center_loss.py +0 -88
  182. pyxlpr/ppocr/losses/cls_loss.py +0 -30
  183. pyxlpr/ppocr/losses/combined_loss.py +0 -67
  184. pyxlpr/ppocr/losses/det_basic_loss.py +0 -208
  185. pyxlpr/ppocr/losses/det_db_loss.py +0 -80
  186. pyxlpr/ppocr/losses/det_east_loss.py +0 -63
  187. pyxlpr/ppocr/losses/det_pse_loss.py +0 -149
  188. pyxlpr/ppocr/losses/det_sast_loss.py +0 -121
  189. pyxlpr/ppocr/losses/distillation_loss.py +0 -272
  190. pyxlpr/ppocr/losses/e2e_pg_loss.py +0 -140
  191. pyxlpr/ppocr/losses/kie_sdmgr_loss.py +0 -113
  192. pyxlpr/ppocr/losses/rec_aster_loss.py +0 -99
  193. pyxlpr/ppocr/losses/rec_att_loss.py +0 -39
  194. pyxlpr/ppocr/losses/rec_ctc_loss.py +0 -44
  195. pyxlpr/ppocr/losses/rec_enhanced_ctc_loss.py +0 -70
  196. pyxlpr/ppocr/losses/rec_nrtr_loss.py +0 -30
  197. pyxlpr/ppocr/losses/rec_sar_loss.py +0 -28
  198. pyxlpr/ppocr/losses/rec_srn_loss.py +0 -47
  199. pyxlpr/ppocr/losses/table_att_loss.py +0 -109
  200. pyxlpr/ppocr/metrics/__init__.py +0 -44
  201. pyxlpr/ppocr/metrics/cls_metric.py +0 -45
  202. pyxlpr/ppocr/metrics/det_metric.py +0 -82
  203. pyxlpr/ppocr/metrics/distillation_metric.py +0 -73
  204. pyxlpr/ppocr/metrics/e2e_metric.py +0 -86
  205. pyxlpr/ppocr/metrics/eval_det_iou.py +0 -274
  206. pyxlpr/ppocr/metrics/kie_metric.py +0 -70
  207. pyxlpr/ppocr/metrics/rec_metric.py +0 -75
  208. pyxlpr/ppocr/metrics/table_metric.py +0 -50
  209. pyxlpr/ppocr/modeling/architectures/__init__.py +0 -32
  210. pyxlpr/ppocr/modeling/architectures/base_model.py +0 -88
  211. pyxlpr/ppocr/modeling/architectures/distillation_model.py +0 -60
  212. pyxlpr/ppocr/modeling/backbones/__init__.py +0 -54
  213. pyxlpr/ppocr/modeling/backbones/det_mobilenet_v3.py +0 -268
  214. pyxlpr/ppocr/modeling/backbones/det_resnet_vd.py +0 -246
  215. pyxlpr/ppocr/modeling/backbones/det_resnet_vd_sast.py +0 -285
  216. pyxlpr/ppocr/modeling/backbones/e2e_resnet_vd_pg.py +0 -265
  217. pyxlpr/ppocr/modeling/backbones/kie_unet_sdmgr.py +0 -186
  218. pyxlpr/ppocr/modeling/backbones/rec_mobilenet_v3.py +0 -138
  219. pyxlpr/ppocr/modeling/backbones/rec_mv1_enhance.py +0 -258
  220. pyxlpr/ppocr/modeling/backbones/rec_nrtr_mtb.py +0 -48
  221. pyxlpr/ppocr/modeling/backbones/rec_resnet_31.py +0 -210
  222. pyxlpr/ppocr/modeling/backbones/rec_resnet_aster.py +0 -143
  223. pyxlpr/ppocr/modeling/backbones/rec_resnet_fpn.py +0 -307
  224. pyxlpr/ppocr/modeling/backbones/rec_resnet_vd.py +0 -286
  225. pyxlpr/ppocr/modeling/heads/__init__.py +0 -54
  226. pyxlpr/ppocr/modeling/heads/cls_head.py +0 -52
  227. pyxlpr/ppocr/modeling/heads/det_db_head.py +0 -118
  228. pyxlpr/ppocr/modeling/heads/det_east_head.py +0 -121
  229. pyxlpr/ppocr/modeling/heads/det_pse_head.py +0 -37
  230. pyxlpr/ppocr/modeling/heads/det_sast_head.py +0 -128
  231. pyxlpr/ppocr/modeling/heads/e2e_pg_head.py +0 -253
  232. pyxlpr/ppocr/modeling/heads/kie_sdmgr_head.py +0 -206
  233. pyxlpr/ppocr/modeling/heads/multiheadAttention.py +0 -163
  234. pyxlpr/ppocr/modeling/heads/rec_aster_head.py +0 -393
  235. pyxlpr/ppocr/modeling/heads/rec_att_head.py +0 -202
  236. pyxlpr/ppocr/modeling/heads/rec_ctc_head.py +0 -88
  237. pyxlpr/ppocr/modeling/heads/rec_nrtr_head.py +0 -826
  238. pyxlpr/ppocr/modeling/heads/rec_sar_head.py +0 -402
  239. pyxlpr/ppocr/modeling/heads/rec_srn_head.py +0 -280
  240. pyxlpr/ppocr/modeling/heads/self_attention.py +0 -406
  241. pyxlpr/ppocr/modeling/heads/table_att_head.py +0 -246
  242. pyxlpr/ppocr/modeling/necks/__init__.py +0 -32
  243. pyxlpr/ppocr/modeling/necks/db_fpn.py +0 -111
  244. pyxlpr/ppocr/modeling/necks/east_fpn.py +0 -188
  245. pyxlpr/ppocr/modeling/necks/fpn.py +0 -138
  246. pyxlpr/ppocr/modeling/necks/pg_fpn.py +0 -314
  247. pyxlpr/ppocr/modeling/necks/rnn.py +0 -92
  248. pyxlpr/ppocr/modeling/necks/sast_fpn.py +0 -284
  249. pyxlpr/ppocr/modeling/necks/table_fpn.py +0 -110
  250. pyxlpr/ppocr/modeling/transforms/__init__.py +0 -28
  251. pyxlpr/ppocr/modeling/transforms/stn.py +0 -135
  252. pyxlpr/ppocr/modeling/transforms/tps.py +0 -308
  253. pyxlpr/ppocr/modeling/transforms/tps_spatial_transformer.py +0 -156
  254. pyxlpr/ppocr/optimizer/__init__.py +0 -61
  255. pyxlpr/ppocr/optimizer/learning_rate.py +0 -228
  256. pyxlpr/ppocr/optimizer/lr_scheduler.py +0 -49
  257. pyxlpr/ppocr/optimizer/optimizer.py +0 -160
  258. pyxlpr/ppocr/optimizer/regularizer.py +0 -52
  259. pyxlpr/ppocr/postprocess/__init__.py +0 -55
  260. pyxlpr/ppocr/postprocess/cls_postprocess.py +0 -33
  261. pyxlpr/ppocr/postprocess/db_postprocess.py +0 -234
  262. pyxlpr/ppocr/postprocess/east_postprocess.py +0 -143
  263. pyxlpr/ppocr/postprocess/locality_aware_nms.py +0 -200
  264. pyxlpr/ppocr/postprocess/pg_postprocess.py +0 -52
  265. pyxlpr/ppocr/postprocess/pse_postprocess/__init__.py +0 -15
  266. pyxlpr/ppocr/postprocess/pse_postprocess/pse/__init__.py +0 -29
  267. pyxlpr/ppocr/postprocess/pse_postprocess/pse/setup.py +0 -14
  268. pyxlpr/ppocr/postprocess/pse_postprocess/pse_postprocess.py +0 -118
  269. pyxlpr/ppocr/postprocess/rec_postprocess.py +0 -654
  270. pyxlpr/ppocr/postprocess/sast_postprocess.py +0 -355
  271. pyxlpr/ppocr/tools/__init__.py +0 -14
  272. pyxlpr/ppocr/tools/eval.py +0 -83
  273. pyxlpr/ppocr/tools/export_center.py +0 -77
  274. pyxlpr/ppocr/tools/export_model.py +0 -129
  275. pyxlpr/ppocr/tools/infer/predict_cls.py +0 -151
  276. pyxlpr/ppocr/tools/infer/predict_det.py +0 -300
  277. pyxlpr/ppocr/tools/infer/predict_e2e.py +0 -169
  278. pyxlpr/ppocr/tools/infer/predict_rec.py +0 -414
  279. pyxlpr/ppocr/tools/infer/predict_system.py +0 -204
  280. pyxlpr/ppocr/tools/infer/utility.py +0 -629
  281. pyxlpr/ppocr/tools/infer_cls.py +0 -83
  282. pyxlpr/ppocr/tools/infer_det.py +0 -134
  283. pyxlpr/ppocr/tools/infer_e2e.py +0 -122
  284. pyxlpr/ppocr/tools/infer_kie.py +0 -153
  285. pyxlpr/ppocr/tools/infer_rec.py +0 -146
  286. pyxlpr/ppocr/tools/infer_table.py +0 -107
  287. pyxlpr/ppocr/tools/program.py +0 -596
  288. pyxlpr/ppocr/tools/test_hubserving.py +0 -117
  289. pyxlpr/ppocr/tools/train.py +0 -163
  290. pyxlpr/ppocr/tools/xlprog.py +0 -748
  291. pyxlpr/ppocr/utils/EN_symbol_dict.txt +0 -94
  292. pyxlpr/ppocr/utils/__init__.py +0 -24
  293. pyxlpr/ppocr/utils/dict/ar_dict.txt +0 -117
  294. pyxlpr/ppocr/utils/dict/arabic_dict.txt +0 -162
  295. pyxlpr/ppocr/utils/dict/be_dict.txt +0 -145
  296. pyxlpr/ppocr/utils/dict/bg_dict.txt +0 -140
  297. pyxlpr/ppocr/utils/dict/chinese_cht_dict.txt +0 -8421
  298. pyxlpr/ppocr/utils/dict/cyrillic_dict.txt +0 -163
  299. pyxlpr/ppocr/utils/dict/devanagari_dict.txt +0 -167
  300. pyxlpr/ppocr/utils/dict/en_dict.txt +0 -63
  301. pyxlpr/ppocr/utils/dict/fa_dict.txt +0 -136
  302. pyxlpr/ppocr/utils/dict/french_dict.txt +0 -136
  303. pyxlpr/ppocr/utils/dict/german_dict.txt +0 -143
  304. pyxlpr/ppocr/utils/dict/hi_dict.txt +0 -162
  305. pyxlpr/ppocr/utils/dict/it_dict.txt +0 -118
  306. pyxlpr/ppocr/utils/dict/japan_dict.txt +0 -4399
  307. pyxlpr/ppocr/utils/dict/ka_dict.txt +0 -153
  308. pyxlpr/ppocr/utils/dict/korean_dict.txt +0 -3688
  309. pyxlpr/ppocr/utils/dict/latin_dict.txt +0 -185
  310. pyxlpr/ppocr/utils/dict/mr_dict.txt +0 -153
  311. pyxlpr/ppocr/utils/dict/ne_dict.txt +0 -153
  312. pyxlpr/ppocr/utils/dict/oc_dict.txt +0 -96
  313. pyxlpr/ppocr/utils/dict/pu_dict.txt +0 -130
  314. pyxlpr/ppocr/utils/dict/rs_dict.txt +0 -91
  315. pyxlpr/ppocr/utils/dict/rsc_dict.txt +0 -134
  316. pyxlpr/ppocr/utils/dict/ru_dict.txt +0 -125
  317. pyxlpr/ppocr/utils/dict/ta_dict.txt +0 -128
  318. pyxlpr/ppocr/utils/dict/table_dict.txt +0 -277
  319. pyxlpr/ppocr/utils/dict/table_structure_dict.txt +0 -2759
  320. pyxlpr/ppocr/utils/dict/te_dict.txt +0 -151
  321. pyxlpr/ppocr/utils/dict/ug_dict.txt +0 -114
  322. pyxlpr/ppocr/utils/dict/uk_dict.txt +0 -142
  323. pyxlpr/ppocr/utils/dict/ur_dict.txt +0 -137
  324. pyxlpr/ppocr/utils/dict/xi_dict.txt +0 -110
  325. pyxlpr/ppocr/utils/dict90.txt +0 -90
  326. pyxlpr/ppocr/utils/e2e_metric/Deteval.py +0 -574
  327. pyxlpr/ppocr/utils/e2e_metric/polygon_fast.py +0 -83
  328. pyxlpr/ppocr/utils/e2e_utils/extract_batchsize.py +0 -87
  329. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_fast.py +0 -457
  330. pyxlpr/ppocr/utils/e2e_utils/extract_textpoint_slow.py +0 -592
  331. pyxlpr/ppocr/utils/e2e_utils/pgnet_pp_utils.py +0 -162
  332. pyxlpr/ppocr/utils/e2e_utils/visual.py +0 -162
  333. pyxlpr/ppocr/utils/en_dict.txt +0 -95
  334. pyxlpr/ppocr/utils/gen_label.py +0 -81
  335. pyxlpr/ppocr/utils/ic15_dict.txt +0 -36
  336. pyxlpr/ppocr/utils/iou.py +0 -54
  337. pyxlpr/ppocr/utils/logging.py +0 -69
  338. pyxlpr/ppocr/utils/network.py +0 -84
  339. pyxlpr/ppocr/utils/ppocr_keys_v1.txt +0 -6623
  340. pyxlpr/ppocr/utils/profiler.py +0 -110
  341. pyxlpr/ppocr/utils/save_load.py +0 -150
  342. pyxlpr/ppocr/utils/stats.py +0 -72
  343. pyxlpr/ppocr/utils/utility.py +0 -80
  344. pyxlpr/ppstructure/__init__.py +0 -13
  345. pyxlpr/ppstructure/predict_system.py +0 -187
  346. pyxlpr/ppstructure/table/__init__.py +0 -13
  347. pyxlpr/ppstructure/table/eval_table.py +0 -72
  348. pyxlpr/ppstructure/table/matcher.py +0 -192
  349. pyxlpr/ppstructure/table/predict_structure.py +0 -136
  350. pyxlpr/ppstructure/table/predict_table.py +0 -221
  351. pyxlpr/ppstructure/table/table_metric/__init__.py +0 -16
  352. pyxlpr/ppstructure/table/table_metric/parallel.py +0 -51
  353. pyxlpr/ppstructure/table/table_metric/table_metric.py +0 -247
  354. pyxlpr/ppstructure/table/tablepyxl/__init__.py +0 -13
  355. pyxlpr/ppstructure/table/tablepyxl/style.py +0 -283
  356. pyxlpr/ppstructure/table/tablepyxl/tablepyxl.py +0 -118
  357. pyxlpr/ppstructure/utility.py +0 -71
  358. pyxlpr/xlai.py +0 -10
pyxllib/file/specialist/__init__.py
@@ -1,685 +1,685 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # @Author : 陈坤泽
+ # @Email : 877362867@qq.com
+ # @Date : 2021/06/06 17:46
+ import re
+
+ from pyxllib.prog.pupil import check_install_package
+
+ check_install_package('joblib', 'joblib>=1.3.2')
+
+ from collections import OrderedDict
+ import sqlite3
+
+ from joblib import Parallel, delayed
+
+ from pyxllib.file.specialist.filelib import *
+ from pyxllib.file.specialist.dirlib import *
+ from pyxllib.file.specialist.download import *
+
+
+ def merge_jsonl(*infiles):
+     data = []
+     for f in infiles:
+         data += XlPath(f).read_jsonl()
+     return data
+
+
+ class JsonlDataFile:
+     """ General-purpose jsonl file handler """
+
+     def __init__(self, filepath=None, num_records=None):
+         """
+         Read data from the given jsonl file. Either all records or only the first N can be loaded.
+
+         :param str filepath: path to the jsonl file
+         :param int num_records: number of records to read; if None, read all the data
+         """
+         self.infile = None
+         self.records = []
+
+         if filepath is not None:
+             filepath = XlPath(filepath)
+             if '?k' in filepath.name:  # if the file name contains '?', a pattern-matching lookup is needed
+                 new_name = filepath.name.replace('?k', '*')
+                 filepaths = list(filepath.parent.glob(new_name))
+                 if filepaths:
+                     filepath = filepaths[0]  # take the first matching file
+                     self.infile = XlPath(filepath)
+             else:
+                 self.infile = filepath
+
+         if self.infile and self.infile.is_file():  # by design the file may not exist yet; it may just be a preset path
+             if num_records is None:
+                 # read all the data
+                 if self.infile.is_file():
+                     self.records = self.infile.read_jsonl()
+             else:
+                 # read only part of the data
+                 self.read_partial_records(num_records)
+
+     def __len__(self):
+         return len(self.records)
+
+     def yield_record(self, start=0, end=None, step=1, batch_size=None):
+         """ Yield the records in the given range
+
+         :param int start: index of the first record, 0 by default
+         :param int end: index to stop at, None by default (read to the end)
+         :param int step: step size, 1 by default
+         :param int batch_size: number of records per batch; if None, yield records one by one
+         """
+         total_records = len(self.records)  # total number of records
+
+         # handle negative indices
+         if start < 0 or (end is not None and end < 0):
+             if start < 0:
+                 start = total_records + start
+             if end is not None and end < 0:
+                 end = total_records + end
+
+         iterator = islice(self.records, start, end, step)
+         while True:
+             batch = list(islice(iterator, batch_size))
+             if not batch:
+                 break
+             if batch_size is None:
+                 yield from batch
+             else:
+                 yield batch
+
+     def yield_group(self, key, sort_mode='keep'):
+         """ Yield the data in groups
+
+         :param key: a function mapping each record to a group key; grouping follows this mapping
+         :param sort_mode:
+             keep: preserve the original relative order
+             id: sort by the value of the id field
+             sort: sort by the value of the key
+         """
+         # 1 create an ordered dict to hold the groups
+         grouped_data = OrderedDict()
+
+         records = self.records
+         if sort_mode == 'id':
+             records = sorted(records, key=lambda x: x['id'])
+
+         # 2 group the data
+         for record in records:
+             k = key(record)
+             if k not in grouped_data:
+                 grouped_data[k] = [record]
+             else:
+                 grouped_data[k].append(record)
+
+         # 3 re-sort the grouped data and merge it into a new list
+         # group-level statistics could also be computed here
+         if sort_mode == 'sort':
+             grouped_data = {k: grouped_data[k] for k in sorted(grouped_data.keys())}
+
+         # 4 yield the grouped data
+         yield from grouped_data.values()
+
+     def read_partial_records(self, num_records):
+         """ Read only the given number of records from the jsonl file """
+         if self.infile and self.infile.is_file():
+             try:
+                 lines = next(self.infile.yield_line(batch_size=num_records))
+                 for line in lines:
+                     self.records.append(json.loads(line))
+             except StopIteration:
+                 self.records = []
+
+     def save(self, outfile=None, ensure_ascii=False, json_encoder=None):
+         """ Save the current records to the given jsonl file """
+         if outfile is None:  # save back to the original file by default
+             outfile = self.infile
+         p = XlPath(outfile)
+
+         # if the file name contains '?k', replace the '?' with the number of records
+         if m := re.search(r'\?k', p.name):
+             n = len(self.records)
+             if n < 500:
+                 replace_str = f'{n}'  # fewer than 500: use the exact count
+             else:
+                 v = int(round(n / 1000))  # 500 or more: round to units of a thousand, 'k'
+                 replace_str = f'{v}k'
+             # substitute the new string for the old one
+             new_name = re.sub(r'\?k', replace_str, p.name)
+             p = p.with_name(new_name)  # rename the file
+
+         p.parent.mkdir(parents=True, exist_ok=True)
+         p.write_jsonl(self.records, ensure_ascii=ensure_ascii, default=json_encoder)
+
+     def browse_record(self, index=None, paths=None, **kwargs):
+         """ Show the content of the given record in a browser """
+         from pyxllib.prog.specialist import browser
+
+         # if no index is given, try to find the first record matching the query arguments
+         if index is None:
+             index = self.find_index(paths, **kwargs)
+         if index is None:
+             raise ValueError('No matching record found')
+
+         record = self.records[index]
+         html_content = ['<html><body><pre>',
+                         json.dumps(record, ensure_ascii=False, indent=4),
+                         '</pre></body></html>']
+         html_file = (XlPath.tempdir() / f'{self.__class__.__name__}_{index}.html')
+         html_file.write_text('\n'.join(html_content))
+         browser.html(html_file)
+
+     def browse_records(self, indices=None, paths=None, **kwargs):
+         """ Show all matching records in a browser """
+         from pyxllib.prog.specialist import browser
+
+         if indices is None:
+             indices = list(self.find_indexs(paths, **kwargs))
+         if not indices:
+             raise ValueError('No matching records found')
+
+         html_content = ['<html><body><h1>Matching Records: {}</h1>'.format(len(indices))]
+
+         for index in indices:
+             record = self.records[index]
+             html_content.extend([
+                 '<h2>Record {}</h2>'.format(index),
+                 '<pre>',
+                 json.dumps(record, ensure_ascii=False, indent=4),
+                 '</pre>'
+             ])
+
+         html_content.append('</body></html>')
+         html_file = (XlPath.tempdir() / f'{self.__class__.__name__}_matched.html')
+         html_file.write_text('\n'.join(html_content))
+         browser.html(html_file)
+
+     def find_indexs(self, paths=None, **kwargs):
+         """ Find the indices of records satisfying the given conditions, yielding every match """
+         paths = paths or {}
+
+         for i, record in enumerate(self.records):
+             # check every condition in kwargs
+             for key, value in kwargs.items():
+                 if callable(value):
+                     if not value(record.get(key)):
+                         break
+                 elif record.get(key) != value:
+                     break
+             else:
+                 # check every condition in paths
+                 for path, value in paths.items():
+                     try:
+                         actual_value = eval(f'record{path}')
+                     except Exception:
+                         break
+
+                     if callable(value):
+                         if not value(actual_value):
+                             break
+                     elif actual_value != value:
+                         break
+                 else:
+                     # the record satisfies every condition, so yield its index
+                     yield i
+
+     def find_index(self, paths=None, **kwargs):
+         """
+         :param dict paths: rules for more complex cases that kwargs cannot express
+             key: lookup expression
+             value: the value that must be matched
+             Example: find_index({"['messages'][0]['role']": 'user'})
+         :param kwargs: names of direct child nodes and their expected values
+             Example: find_index(id=2023071320000003)
+
+         Notes:
+             1. paths and kwargs can be combined; every rule in both must hold at the same time
+             2. a value can be a function def func(v) -> bool that receives the value and returns whether the condition holds
+         """
+         return next(self.find_indexs(paths, **kwargs), None)
+
+     def add_record_basic(self, **kwargs):
+         """ The most basic interface for adding a single record """
+         record = kwargs
+         self.records.append(record)
+         return record
+
+     @classmethod
+     def read_from_files(cls, src_files):
+         """ Read and merge data from multiple files, returning a new JsonlDataFile instance """
+         merged_records = []
+         for file in src_files:
+             jsonl_file = cls(file)
+             merged_records.extend(jsonl_file.records)
+         # create and return the new instance
+         new_instance = cls()
+         new_instance.records = merged_records
+         return new_instance
+
+     @classmethod
+     def read_from_dir(cls, src_dir):
+         """ Read and merge data from every jsonl file in a directory, returning a new JsonlDataFile instance """
+         src_dir = XlPath(src_dir)
+         src_files = [str(file_path) for file_path in src_dir.glob('*.jsonl')]
+         return cls.read_from_files(src_files)
+
+     def __add__(self, other):
+         """ Implement the + operator, merging the records of two JsonlDataFile objects """
+         if not isinstance(other, JsonlDataFile):
+             raise TypeError(f'Unsupported operand type: {type(other)}')
+         result = JsonlDataFile()
+         result.records = self.records + other.records
+         return result
+
+     def __iadd__(self, other):
+         """ Implement in-place addition, i.e. += """
+         if not isinstance(other, JsonlDataFile):
+             raise TypeError(f'Unsupported operand type: {type(other)}')
+         self.records += other.records
+         return self
+
+     def process_each_record(self, func, *,
+                             inplace=False,
+                             timeout=None,
+                             print_mode=0,
+                             threads_num=1,
+                             **kwargs):
+         """ Apply func to every record, optionally modifying records in place and showing a progress bar
+
+         :param function func: processing function; it receives a record and returns the processed record, or None to drop it
+         :param bool inplace: whether to modify records in place; if False (default), a new JsonlDataFile is created and returned
+         :param int print_mode: whether to show a progress bar; 0 hides it (default), 1 shows it
+         :return JsonlDataFile or None: if inplace is False, return the new JsonlDataFile, otherwise return None
+         :param int threads_num: number of threads, 1 by default (single-threaded)
+
+         Iterates over self.records, applying func to each record; if func returns None, that record is excluded from the new records.
+         """
+         backend = 'threading' if threads_num != 1 else 'sequential'
+
+         if print_mode:
+             parallel = Parallel(n_jobs=threads_num, backend=backend,
+                                 timeout=timeout, return_as='generator')
+             tasks = [delayed(func)(record) for record in self.records]
+             new_records = []
+             for y in tqdm(parallel(tasks), total=len(self.records), **kwargs):
+                 if y:
+                     new_records.append(y)
+         else:
+             parallel = Parallel(n_jobs=threads_num, backend=backend, timeout=timeout)
+             tasks = [delayed(func)(record) for record in self.records]
+             new_records = parallel(tasks)
+             new_records = [y for y in new_records if y]
+
+         if inplace:
+             self.records = new_records
+
+         return new_records
+
+     def update_each_record(self, func,
+                            timeout=None,
+                            print_mode=0,
+                            threads_num=1):
+         """ Iterate over the original data and modify it in place """
+         return self.process_each_record(func,
+                                         inplace=True,
+                                         timeout=timeout,
+                                         print_mode=print_mode,
+                                         threads_num=threads_num)
+
+
+ class JsonlDataDir:
+     """ Note: the design goal of this class is to mimic JsonlDataFile as closely as possible, so downstream work stays consistent """
+
+     def __init__(self, root):
+         """ Generally used to handle a large jsonl file by placing it in a directory and splitting it into multiple jsonl files
+
+         Note that the files to process are recognized by the naming pattern 01.jsonl, 02.jsonl, ...; do not change this convention
+         """
+         self.root = XlPath(root)
+         self.files = []
+         self.update_subfiles()
+
+     def update_subfiles(self):
+         self.files = []
+         for f in self.root.glob_files('*.jsonl'):
+             if re.match(r'_?\d+$', f.stem):  # '_?' is kept for backward compatibility for now; later this should match only _\d+
+                 self.files.append(f)
+
+     def __bool__(self):
+         if self.root.is_dir() and self.files:
+             return True
+         else:
+             return False
+
+     def count_records(self):
+         total = 0
+         for f in self.files:
+             total += f.get_total_lines(skip_blank=True)
+         return total
+
+     def check(self, title=''):
+         """ Report some basic data statistics """
+         print(title, 'files:', len(self.files), 'records:', self.count_records())
+
+     @classmethod
+     def init_from_file(cls, file, lines_per_file=10000):
+         """ Initialize a JsonlDataDir object from a single jsonl file """
+         file = XlPath(file)
+         dst_dir = file.parent / file.stem
+         if not dst_dir.is_dir() and file.is_file():
+             file.split_to_dir(lines_per_file, dst_dir)
+         c = cls(dst_dir)
+         return c
+
+     def _rearrange_group(self, lines_per_file=10000,
+                          group_key=None, sort_mode='keep',
+                          print_mode=1):
+         # 1 store the data and the grouping information in sqlite3
+         # create a temporary file to serve as the SQLite database
+         temp_db_file = self.root / 'data.sqlite3'
+         temp_db_file.delete()
+
+         # open a SQLite connection on the temporary file
+         conn = sqlite3.connect(temp_db_file)
+         cursor = conn.cursor()
+
+         # create a temporary table to store the jsonl data
+         cursor.execute('CREATE TABLE records (id INTEGER PRIMARY KEY AUTOINCREMENT,'
+                        'data TEXT, group_key TEXT)')
+         # index the group_key column
+         cursor.execute('CREATE INDEX idx_group_key ON records(group_key)')
+
+         # load the data from the jsonl files into the SQLite database
+         commit_interval = 2000  # how many records between commits
+         count = 0
+         for record in tqdm(self.yield_record(), desc='computing the group of each record', disable=not print_mode):
+             count += 1
+             group = group_key(record) if group_key else count
+             group = str(group)
+             cursor.execute('INSERT INTO records (data, group_key) VALUES (?, ?)',
+                            (json.dumps(record, ensure_ascii=False), group))
+             if count % commit_interval == 0:
+                 conn.commit()
+         conn.commit()
+
+         # 2 query the database for sorting and grouping, and write the results into new jsonl files
+         new_file_count = 0
+         lines_written = 0
+         current_file = None
+         sort_sql = ''
+         if sort_mode == 'id':
+             sort_sql = 'ORDER BY id'
+         elif sort_mode == 'sort':
+             sort_sql = f'ORDER BY {group_key}'
+
+         for group, in tqdm(cursor.execute('SELECT DISTINCT group_key FROM records').fetchall(),
+                            desc='extracting each group',
+                            disable=not print_mode):
+             query = f'SELECT data FROM records WHERE group_key = ? {sort_sql}'
+             cursor.execute(query, (group,))
+
+             if current_file is None or lines_written >= lines_per_file:
+                 if current_file:
+                     current_file.close()
+                 new_file_name = f'temp_{new_file_count}.jsonl'
+                 new_file_path = self.root / new_file_name
+                 current_file = new_file_path.open('w', encoding='utf-8')
+                 new_file_count += 1
+                 lines_written = 0
+
+             while True:
+                 row = cursor.fetchone()
+                 if row is None:
+                     break
+
+                 current_file.write(row[0] + '\n')
+                 lines_written += 1
+
+         if current_file:
+             current_file.close()
+
+         # 3 close the database connection and delete the temporary file
+         conn.close()
+         temp_db_file.delete()
+
+         # 4 delete the old files and rename the new ones
+         for f in self.files:
+             f.delete()
+
+         widths = len(str(new_file_count))
+         for temp_file in self.root.glob('temp_*.jsonl'):
+             n = int(re.search(r'\d+', temp_file.name).group())
+             temp_file.rename(self.root / f'_{n:0{widths}}.jsonl')
+
+     def rearrange(self, lines_per_file=10000, group_key=None,
+                   sort_mode='keep', print_mode=1):
+         """ Reorganize how the data is split across files
+
+         :param int lines_per_file: number of lines per file
+         :param func group_key: grouping function; records with the same key are guaranteed to end up in the same file
+         :param str sort_mode:
+             keep: preserve the original relative order
+             id: sort by the value of the id field
+             sort: sort by the value of the key
+         """
+         if group_key is not None or sort_mode != 'keep':
+             return self._rearrange_group(lines_per_file, group_key, sort_mode, print_mode)
+
+         output_dir = self.root
+         temp_prefix = 'temp_'
+
+         new_file_count = 0
+         new_file = None
+         line_count = 0
+
+         # count the total lines to determine how many leading zeros the file names need
+         total_lines = sum(1 for file in self.files for _ in file.open('r', encoding='utf-8'))
+         num_digits = len(str((total_lines + lines_per_file - 1) // lines_per_file))
+
+         for file in self.files:
+             with file.open('r', encoding='utf-8') as f:
+                 for line in f:
+                     if not line.strip():
+                         continue
+                     if line_count == 0:
+                         if new_file is not None:
+                             new_file.close()
+                         new_file_name = f'{temp_prefix}{new_file_count:0{num_digits}d}.jsonl'
+                         new_file_path = output_dir / new_file_name
+                         new_file = new_file_path.open('w', encoding='utf-8')
+                         new_file_count += 1
+
+                     new_file.write(line)
+                     line_count += 1
+
+                     if line_count == lines_per_file:
+                         line_count = 0
+
+         if new_file is not None:
+             new_file.close()
+
+         # delete the old files
+         for file in self.files:
+             os.remove(file)
+
+         # rename the temporary files to their final names
+         for temp_file in output_dir.glob(f'{temp_prefix}*.jsonl'):
+             final_name = temp_file.name[len(temp_prefix) - 1:]
+             temp_file.rename(output_dir / final_name)
+
+     def yield_record(self, batch_size=None):
+         """ Yield the data records
+
+         :param int batch_size: number of records per batch; if None, yield records one by one
+         """
+         for i, file in enumerate(self.files):
+             data = file.read_jsonl()
+             iterator = iter(data)
+             while True:
+                 batch = list(islice(iterator, batch_size))
+                 if not batch:
+                     break
+                 if batch_size is None:
+                     yield from batch
+                 else:
+                     yield batch
+
+     def yield_group(self, key, sort_mode='keep'):
+         """ Yield the data in groups
+
+         :param key: a function mapping each record to a group key; grouping follows this mapping
+
+         Note: grouping runs on each sub-file separately; there is no global search.
+         Usually self.rearrange should be used first to reorder the files globally before calling this function.
+         """
+         for filepath in self.files:
+             jdf = JsonlDataFile(filepath)
+             yield from jdf.yield_group(key, sort_mode)
+
+     def process_each_file(self, func=None, *,
+                           print_mode=0, desc='process_each_file',
+                           processes_num=1,
+                           subfiles=None,
+                           **kwargs):
+         # 1 backend
+         backend = 'loky' if processes_num != 1 else 'sequential'
+
+         # 2 tasks
+         if subfiles is None:
+             subfiles = [0, len(self.files)]
+         elif not isinstance(subfiles, (list, tuple)):
+             subfiles = [subfiles, subfiles + 1]
+         a, b = subfiles
+         tasks = [delayed(func)(file) for file in self.files[a:b]]
+
+         # 3 run
+         if print_mode:
+             parallel = Parallel(n_jobs=processes_num, backend=backend, return_as='generator')
+             list(tqdm(parallel(tasks), total=len(self.files), desc=desc, **kwargs))
+         else:
+             parallel = Parallel(n_jobs=processes_num, backend=backend)
+             parallel(tasks)
+
+     def process_each_record(self, func, *,
+                             inplace=False, reset=False,
+                             print_mode=2, desc=None,
+                             timeout=None,
+                             processes_num=1, threads_num=1,
+                             dst_dir=None, json_encoder=None,
+                             subfiles=None):
+         """ Wrapper that applies an operation to every record
+
+         :param func: processing function supplied by the caller
+         :param inplace: whether to modify the original data
+         :param reset: whether to re-process files that have already been processed
+         :param print_mode:
+             0 show nothing
+             1 show only file-level progress
+             2 (default) show more detailed progress within each file
+         :param desc: progress bar title when print_mode=1
+         :param timeout: time limit; unusable in some situations (e.g. signal cannot be used in a Linux subprocess)
+             where it cannot be used, mechanisms such as the timeout built into requests can enforce the limit instead
+         :param processes_num: number of processes; each file runs as a separate process
+         :param threads_num: number of threads used while processing each file
+         :param dst_dir: target directory to save to; nothing is saved when unset
+         :param json_encoder: how to handle non-standard json structures; usually set to str when needed
+         :param subfiles: run only some of the sub-files
+             a: run only the file numbered a
+             [a, b]: run the files in the half-open interval [a, b)
+         """
+         files_num = len(self.files)
+
+         def process_jsonl_file(srcfile):
+             # 1 if reset is off and dstfile exists, skip the file
+             srcfile = XlPath(srcfile)
+             if dst_dir:
+                 dstfile = XlPath(dst_dir) / srcfile.name
+             else:
+                 dstfile = None
+             if not reset and dstfile and dstfile.is_file():
+                 return
+
+             # 2 run the records of this particular file
+             jdf = JsonlDataFile(srcfile)
+             new_records = jdf.process_each_record(func,
+                                                   inplace=inplace,
+                                                   print_mode=print_mode == 2,
+                                                   desc=f'{jdf.infile.name}/{files_num}',
+                                                   timeout=timeout,
+                                                   threads_num=threads_num,
+                                                   mininterval=processes_num * 3,
+                                                   )
+
+             # 3 decide whether to modify the original file and whether to save to dst_dir
+             if inplace:
+                 jdf.save()
+
+             if dstfile:
+                 jdf = JsonlDataFile()
+                 jdf.records = new_records
+                 jdf.save(dstfile, json_encoder=json_encoder)
+
+         self.process_each_file(process_jsonl_file, subfiles=subfiles,
+                                processes_num=processes_num,
+                                print_mode=print_mode == 1, desc=desc)
+
+     def process_each_group(self, func, group_key, sort_mode='keep', *,
+                            inplace=False, reset=False,
+                            print_mode=1, desc=None,
+                            processes_num=1,
+                            dst_dir=None,
+                            json_encoder=None):
+         """ Wrapper that processes each group of records
+
+         todo 230909 Sat 14:00: some details may still be incomplete (inner progress bars, multithreading, ...); to be refined gradually as it gets used
+         """
+
+         def process_jsonl_file(srcfile):
+             # 1 if reset is off and dstfile exists, skip the file
+             srcfile = XlPath(srcfile)
+             if dst_dir:
+                 dstfile = XlPath(dst_dir) / srcfile.name
+             else:
+                 dstfile = None
+             if not reset and dstfile and dstfile.is_file():
+                 return
+
+             # 2 run the records of this particular file
+             jdf = JsonlDataFile(srcfile)
+             new_records = []
+             for records in jdf.yield_group(group_key, sort_mode):
+                 records2 = func(records)
+                 if records2:
+                     new_records.extend(records2)
+
+             # 3 decide whether to modify the original file and whether to save to dst_dir
+             if inplace:
+                 jdf.records = new_records
+                 jdf.save()
+
+             if dstfile:
+                 jdf = JsonlDataFile()
+                 jdf.records = new_records
+                 jdf.save(dstfile, json_encoder=json_encoder)
+
+         self.process_each_file(process_jsonl_file,
+                                processes_num=processes_num,
+                                print_mode=print_mode == 1, desc=desc)
+
+     def save(self, dst_path=None):
+         """ Merge the data into a single jsonl file """
+         if not dst_path:
+             dst_path = self.root.parent / f'{self.root.name}.jsonl'
+         dst_path = XlPath(dst_path)
+         dst_path.parent.mkdir(parents=True, exist_ok=True)
+         with dst_path.open('w', encoding='utf8') as f:
+             for file in tqdm(self.files, desc=f'merging files and saving {dst_path.name}'):
+                 with file.open('r', encoding='utf8') as f2:
+                     for line in f2:
+                         if line.strip():  # do not store blank lines
+                             f.write(line)
+
+     def clear(self):
+         for f in self.files:
+             f.delete()
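
The hunk above (the +685/-685 entry, pyxllib/file/specialist/__init__.py) is dominated by the JsonlDataFile class. For orientation, here is a minimal usage sketch based only on the methods visible in this diff; the file names and record fields are illustrative, and it assumes the class is importable from pyxllib.file.specialist as the module path suggests:

```python
from pyxllib.file.specialist import JsonlDataFile  # assumed import path

# Load only the first 1000 records; a '?k' name globs to e.g. train_12k.jsonl.
jdf = JsonlDataFile('train_?k.jsonl', num_records=1000)

# Look up a record by a direct field, or by a nested path expression.
i = jdf.find_index(id=2023071320000003)
j = jdf.find_index({"['messages'][0]['role']": 'user'})

# Stream records in batches of 16 (each batch is a list of up to 16 records).
for batch in jdf.yield_record(batch_size=16):
    pass

# A func returning None drops the record; update_each_record modifies in place.
jdf.update_each_record(lambda r: r if r.get('messages') else None)

# Saving to a '?k' name writes the rounded record count into the file name,
# e.g. 12345 records -> clean_12k.jsonl.
jdf.save('clean_?k.jsonl')
```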
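JsonlDataDir wraps the same record-level interface around a directory of numbered shard files. A sketch of the split, rearrange, process, and merge cycle it supports, under the same import assumption (the grouping key, field names, and shard size are illustrative):

```python
from pyxllib.file.specialist import JsonlDataDir  # assumed import path

# Split one large jsonl file into a directory of ~10k-line shards.
jdd = JsonlDataDir.init_from_file('corpus.jsonl', lines_per_file=10000)
jdd.check('after split')  # prints file and record counts

# Regroup so that records sharing a key land in the same shard
# (backed by the temporary SQLite database shown in _rearrange_group).
jdd.rearrange(lines_per_file=10000, group_key=lambda r: r['user_id'])

def fix(record):
    """ Normalize one record; returning None would drop it. """
    record['text'] = record['text'].strip()
    return record

# Process shard by shard, one worker process per file.
jdd.process_each_record(fix, inplace=True, processes_num=4)

# Merge the shards back into a single jsonl file.
jdd.save('corpus.clean.jsonl')
```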