re-common 10.0.13__tar.gz → 10.0.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (229) hide show
  1. {re_common-10.0.13 → re_common-10.0.15}/PKG-INFO +1 -1
  2. re_common-10.0.15/re_common/v2/baselibrary/decorators/utils.py +59 -0
  3. re_common-10.0.15/re_common/v2/baselibrary/tools/ac_ahocorasick.py +76 -0
  4. re_common-10.0.15/re_common/v2/baselibrary/tools/hdfs_data_processer.py +318 -0
  5. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/search_hash_tools.py +4 -3
  6. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/text_matcher.py +131 -28
  7. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/BusinessStringUtil.py +21 -7
  8. re_common-10.0.15/re_common/v2/baselibrary/utils/basepika.py +180 -0
  9. re_common-10.0.15/re_common/v2/baselibrary/utils/db.py +38 -0
  10. re_common-10.0.15/re_common/v2/baselibrary/utils/mq.py +83 -0
  11. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_bool.py +13 -1
  12. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_clear.py +9 -0
  13. re_common-10.0.15/re_common/vip/proxy/__init__.py +0 -0
  14. {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/PKG-INFO +1 -1
  15. {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/SOURCES.txt +7 -0
  16. {re_common-10.0.13 → re_common-10.0.15}/setup.py +1 -1
  17. {re_common-10.0.13 → re_common-10.0.15}/LICENSE +0 -0
  18. {re_common-10.0.13 → re_common-10.0.15}/README.md +0 -0
  19. {re_common-10.0.13 → re_common-10.0.15}/re_common/__init__.py +0 -0
  20. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/__init__.py +0 -0
  21. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/baseabs/__init__.py +0 -0
  22. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/baseabs/baseabs.py +0 -0
  23. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/__init__.py +0 -0
  24. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/mbuilder.py +0 -0
  25. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/moudle.py +0 -0
  26. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/msqlite3.py +0 -0
  27. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/mysql.py +0 -0
  28. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/database/sql_factory.py +0 -0
  29. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/MThreadingRun.py +0 -0
  30. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/MThreadingRunEvent.py +0 -0
  31. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/__init__.py +0 -0
  32. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/mthread/mythreading.py +0 -0
  33. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/pakge_other/__init__.py +0 -0
  34. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/pakge_other/socks.py +0 -0
  35. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/__init__.py +0 -0
  36. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/config_factory.py +0 -0
  37. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/ini_config.py +0 -0
  38. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/readconfig/toml_config.py +0 -0
  39. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/temporary/__init__.py +0 -0
  40. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/temporary/envdata.py +0 -0
  41. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/__init__.py +0 -0
  42. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/__init__.py +0 -0
  43. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/aiohttp_request.py +0 -0
  44. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/httpx_requet.py +0 -0
  45. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/mrequest.py +0 -0
  46. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/all_requests/requests_request.py +0 -0
  47. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/batch_compre/__init__.py +0 -0
  48. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/batch_compre/bijiao_batch.py +0 -0
  49. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/contrast_db3.py +0 -0
  50. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/copy_file.py +0 -0
  51. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/db3_2_sizedb3.py +0 -0
  52. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/foreachgz.py +0 -0
  53. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/get_attr.py +0 -0
  54. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/image_to_pdf.py +0 -0
  55. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/java_code_deal.py +0 -0
  56. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/javacode.py +0 -0
  57. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mdb_db3.py +0 -0
  58. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/merge_file.py +0 -0
  59. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/merge_gz_file.py +0 -0
  60. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/__init__.py +0 -0
  61. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +0 -0
  62. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/hdfst.py +0 -0
  63. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +0 -0
  64. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mongo_tools.py +0 -0
  65. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_file.py +0 -0
  66. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/__init__.py +0 -0
  67. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +0 -0
  68. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/move_mongo_table.py +0 -0
  69. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/use_mttf.py +0 -0
  70. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/move_mongo/use_mv.py +0 -0
  71. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/__init__.py +0 -0
  72. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +0 -0
  73. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/mpandas/pandas_visualization.py +0 -0
  74. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/myparsel.py +0 -0
  75. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/rename_dir_file.py +0 -0
  76. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/sequoiadb_utils.py +0 -0
  77. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/split_line_to_many.py +0 -0
  78. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/stringtodicts.py +0 -0
  79. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/tools/workwechant_bot.py +0 -0
  80. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/__init__.py +0 -0
  81. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseaiohttp.py +0 -0
  82. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseaiomysql.py +0 -0
  83. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseallstep.py +0 -0
  84. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseavro.py +0 -0
  85. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseboto3.py +0 -0
  86. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basecsv.py +0 -0
  87. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basedict.py +0 -0
  88. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basedir.py +0 -0
  89. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseencode.py +0 -0
  90. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseencoding.py +0 -0
  91. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseesdsl.py +0 -0
  92. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseexcel.py +0 -0
  93. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseexcept.py +0 -0
  94. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basefile.py +0 -0
  95. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseftp.py +0 -0
  96. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basegzip.py +0 -0
  97. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basehdfs.py +0 -0
  98. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basehttpx.py +0 -0
  99. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseip.py +0 -0
  100. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basejson.py +0 -0
  101. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baselist.py +0 -0
  102. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basemotor.py +0 -0
  103. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basemssql.py +0 -0
  104. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseodbc.py +0 -0
  105. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepandas.py +0 -0
  106. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepeewee.py +0 -0
  107. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepika.py +0 -0
  108. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepydash.py +0 -0
  109. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basepymongo.py +0 -0
  110. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basequeue.py +0 -0
  111. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baserar.py +0 -0
  112. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baserequest.py +0 -0
  113. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseset.py +0 -0
  114. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basesmb.py +0 -0
  115. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basestring.py +0 -0
  116. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basetime.py +0 -0
  117. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basetuple.py +0 -0
  118. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/baseurl.py +0 -0
  119. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/basezip.py +0 -0
  120. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/__init__.py +0 -0
  121. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/bottomutils.py +0 -0
  122. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/mdeprecated.py +0 -0
  123. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/mlamada.py +0 -0
  124. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/msginfo.py +0 -0
  125. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/core/requests_core.py +0 -0
  126. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/fateadm.py +0 -0
  127. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/importfun.py +0 -0
  128. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/mfaker.py +0 -0
  129. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/my_abc/__init__.py +0 -0
  130. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/my_abc/better_abc.py +0 -0
  131. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/mylogger.py +0 -0
  132. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/myredisclient.py +0 -0
  133. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/pipupgrade.py +0 -0
  134. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/ringlist.py +0 -0
  135. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/version_compare.py +0 -0
  136. {re_common-10.0.13 → re_common-10.0.15}/re_common/baselibrary/utils/ydmhttp.py +0 -0
  137. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/__init__.py +0 -0
  138. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/lazy_import.py +0 -0
  139. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/loggerfacade.py +0 -0
  140. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/mysqlfacade.py +0 -0
  141. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/now.py +0 -0
  142. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/sqlite3facade.py +0 -0
  143. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/__init__.py +0 -0
  144. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/mq_use_facade.py +0 -0
  145. {re_common-10.0.13 → re_common-10.0.15}/re_common/facade/use/proxy_use_facade.py +0 -0
  146. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/__init__.py +0 -0
  147. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/base_dict_test.py +0 -0
  148. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/baseavro_test.py +0 -0
  149. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basefile_test.py +0 -0
  150. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basemssql_test.py +0 -0
  151. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/baseodbc_test.py +0 -0
  152. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/basepandas_test.py +0 -0
  153. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/__init__.py +0 -0
  154. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/get_attr_test_settings.py +0 -0
  155. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/get_attr_test/settings.py +0 -0
  156. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/idencode_test.py +0 -0
  157. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/iniconfig_test.py +0 -0
  158. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/ip_test.py +0 -0
  159. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/merge_file_test.py +0 -0
  160. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mfaker_test.py +0 -0
  161. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mm3_test.py +0 -0
  162. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mylogger_test.py +0 -0
  163. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/myparsel_test.py +0 -0
  164. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/mysql_test.py +0 -0
  165. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/pymongo_test.py +0 -0
  166. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/split_test.py +0 -0
  167. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/sqlite3_merge_test.py +0 -0
  168. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/sqlite3_test.py +0 -0
  169. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/tomlconfig_test.py +0 -0
  170. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/use_tools_test/__init__.py +0 -0
  171. {re_common-10.0.13 → re_common-10.0.15}/re_common/libtest/user/__init__.py +0 -0
  172. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/__init__.py +0 -0
  173. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/assignment_expressions.py +0 -0
  174. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/mydash/__init__.py +0 -0
  175. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/mydash/test1.py +0 -0
  176. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/pydashstudio/__init__.py +0 -0
  177. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/pydashstudio/first.py +0 -0
  178. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/__init__.py +0 -0
  179. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/first_app.py +0 -0
  180. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/streamlitstudio/uber_pickups.py +0 -0
  181. {re_common-10.0.13 → re_common-10.0.15}/re_common/studio/test.py +0 -0
  182. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/__init__.py +0 -0
  183. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/__init__.py +0 -0
  184. {re_common-10.0.13/re_common/v2/baselibrary/helpers → re_common-10.0.15/re_common/v2/baselibrary/decorators}/__init__.py +0 -0
  185. {re_common-10.0.13/re_common/v2/baselibrary/s3object → re_common-10.0.15/re_common/v2/baselibrary/helpers}/__init__.py +0 -0
  186. {re_common-10.0.13/re_common/v2/baselibrary/tools → re_common-10.0.15/re_common/v2/baselibrary/s3object}/__init__.py +0 -0
  187. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/s3object/baseboto3.py +0 -0
  188. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/WeChatRobot.py +0 -0
  189. {re_common-10.0.13/re_common/v2/baselibrary/utils → re_common-10.0.15/re_common/v2/baselibrary/tools}/__init__.py +0 -0
  190. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/dict_tools.py +0 -0
  191. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/dolphinscheduler.py +0 -0
  192. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/list_tools.py +0 -0
  193. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/tools/unionfind_tools.py +0 -0
  194. {re_common-10.0.13/re_common/vip → re_common-10.0.15/re_common/v2/baselibrary/utils}/__init__.py +0 -0
  195. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/author_smi.py +0 -0
  196. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/basedict.py +0 -0
  197. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/basehdfs.py +0 -0
  198. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/json_cls.py +0 -0
  199. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/n_ary_expression_tree.py +0 -0
  200. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/string_smi.py +0 -0
  201. {re_common-10.0.13 → re_common-10.0.15}/re_common/v2/baselibrary/utils/stringutils.py +0 -0
  202. {re_common-10.0.13/re_common/vip/proxy → re_common-10.0.15/re_common/vip}/__init__.py +0 -0
  203. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/base_step_process.py +0 -0
  204. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/baseencodeid.py +0 -0
  205. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/changetaskname.py +0 -0
  206. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/core_var.py +0 -0
  207. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/mmh3Hash.py +0 -0
  208. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/allproxys.py +0 -0
  209. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/allproxys_thread.py +0 -0
  210. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/cnki_proxy.py +0 -0
  211. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/kuaidaili.py +0 -0
  212. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/proxy_all.py +0 -0
  213. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/update_kuaidaili_0.py +0 -0
  214. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/wanfang_proxy.py +0 -0
  215. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/proxy/wp_proxy_all.py +0 -0
  216. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/read_rawid_to_txt.py +0 -0
  217. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/__init__.py +0 -0
  218. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformBookTitleToZt.py +0 -0
  219. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformConferenceTitleToZt.py +0 -0
  220. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformCstadTitleToZt.py +0 -0
  221. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformJournalTitleToZt.py +0 -0
  222. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformPatentTitleToZt.py +0 -0
  223. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformRegulationTitleToZt.py +0 -0
  224. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformStandardTitleToZt.py +0 -0
  225. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/TransformThesisTitleToZt.py +0 -0
  226. {re_common-10.0.13 → re_common-10.0.15}/re_common/vip/title/transform/__init__.py +0 -0
  227. {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/dependency_links.txt +0 -0
  228. {re_common-10.0.13 → re_common-10.0.15}/re_common.egg-info/top_level.txt +0 -0
  229. {re_common-10.0.13 → re_common-10.0.15}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.13
3
+ Version: 10.0.15
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -0,0 +1,59 @@
1
+ import warnings
2
+ import functools
3
+
4
# Process-wide registry holding id() of every function/class that has already
# emitted its deprecation warning.
_warned_once = set()


def deprecated(message=None):
    """
    Decorator factory marking a function or class as deprecated.

    The warning (``DeprecationWarning``) is emitted at most once per decorated
    object for the lifetime of the process: on the first call of a function,
    or on the first instantiation of a class.

    Args:
        message (str): Optional extra text appended to the default warning.

    Returns:
        A decorator. Applied to a function it returns a wrapper; applied to a
        class it patches ``__init__`` in place and returns the same class.

    Raises:
        TypeError: When the decorated object is neither a plain Python
            function nor a class.
    """
    import types

    def warn_once(obj, kind):
        # Invoked from wrapper/new_init, hence stacklevel=3 so the warning is
        # attributed to the user's call site rather than to this module.
        obj_id = id(obj)
        if obj_id in _warned_once:
            return
        default_msg = f"{kind} {obj.__name__} 已不建议使用。"
        warn_msg = f"{default_msg} {message}" if message else default_msg
        warnings.warn(warn_msg, category=DeprecationWarning, stacklevel=3)
        # Recorded only after warn() returns, so a warning turned into an
        # error by the active filters is not silently marked as delivered.
        _warned_once.add(obj_id)

    def decorator(obj):
        if isinstance(obj, types.FunctionType):
            @functools.wraps(obj)
            def wrapper(*args, **kwargs):
                warn_once(obj, "函数")
                return obj(*args, **kwargs)

            return wrapper

        if isinstance(obj, type):
            orig_init = obj.__init__

            @functools.wraps(orig_init)
            def new_init(self, *args, **kwargs):
                warn_once(obj, "类")
                if orig_init is object.__init__ and type(self).__new__ is not object.__new__:
                    # The class consumes its constructor arguments in __new__
                    # (e.g. an int/str/tuple subclass). Once __init__ is
                    # overridden, object.__init__ rejects any extra argument,
                    # so forwarding them would break a class that worked
                    # before it was decorated.
                    orig_init(self)
                else:
                    orig_init(self, *args, **kwargs)

            obj.__init__ = new_init
            return obj

        raise TypeError("此装饰器仅适用于函数和类")

    return decorator
@@ -0,0 +1,76 @@
1
+ import pickle
2
+
3
+ import ahocorasick
4
+
5
+
6
class ACTool(object):
    """Thin convenience wrapper around an ``ahocorasick.Automaton``."""

    def __init__(self):
        self.automaton = ahocorasick.Automaton()

    def add_word(self, key, value, overwrite=True) -> bool:
        """
        Add a keyword to the automaton.

        :param key: keyword to add
        :param value: value associated with the keyword
        :param overwrite: when True (default) an existing key is overwritten;
            when False an existing key is left untouched
        :return: True if the word was added or overwritten, False if it was
            skipped because it already exists and ``overwrite`` is False
        """
        if not overwrite and key in self.automaton:
            return False
        self.automaton.add_word(key, value)
        return True

    def is_exists_key(self, key) -> bool:
        """Return True when ``key`` is already stored in the automaton."""
        return bool(self.automaton.exists(key))

    def make_automaton(self):
        """Build the automaton; must be called after all words are added."""
        self.automaton.make_automaton()

    def iter(self, key):
        """
        Search ``key`` (the text to scan) for every stored keyword.

        The result is an iterable of ``(end_index, value)`` pairs and can be
        materialised with ``list``::

            tool.add_word("he", "word1")
            tool.add_word("hello", "word2")

            input_string = "hello world"
            matches = list(tool.automaton.iter(input_string))
            print(matches)  # [(1, 'word1'), (4, 'word2')]

        ``(1, 'word1')``: keyword "he" ends at index 1 of "hello world"
        (the position of its last character 'e'); its value is "word1".
        ``(4, 'word2')``: keyword "hello" ends at index 4 (the position of
        'o'); its value is "word2".

        Note: only the value is returned, never the key. If the key is
        needed, store it inside the value.
        """
        return self.automaton.iter(key)  # ahocorasick.AutomatonSearchIter

    def save(self, local_temp_path):
        """Persist the built automaton to ``local_temp_path``."""
        self.automaton.save(local_temp_path, pickle.dumps)

    def load(self, local_temp_path):
        """
        Replace the current automaton with one loaded from ``local_temp_path``.

        SECURITY: stored values are deserialised with ``pickle.loads``, which
        can execute arbitrary code. Only load files from a trusted source.
        """
        self.automaton = ahocorasick.load(local_temp_path, pickle.loads)
@@ -0,0 +1,318 @@
1
+ import asyncio
2
+ import gzip
3
+ import json
4
+ import sqlite3
5
+ import time
6
+ import os
7
+ from io import BytesIO
8
+ from typing import Callable, Any, List
9
+
10
+ from hdfs import InsecureClient
11
+
12
+
13
+ class HDFSDataProcessor:
14
+ def __init__(
15
+ self,
16
+ hdfs_url="http://VIP-DC-MASTER-2:9870",
17
+ hdfs_user="root",
18
+ db_file="processed_files.db",
19
+ batch_size=50,
20
+ retry_limit=3,
21
+ ):
22
+ self.hdfs_url = hdfs_url
23
+ self.hdfs_user = hdfs_user
24
+ self.db_file = db_file
25
+ self.batch_size = batch_size
26
+ self.retry_limit = retry_limit
27
+ self.client = InsecureClient(self.hdfs_url, user=self.hdfs_user)
28
+ self.read_hdfs_fanc = {"all": self.all_read_gz, "batch": self.batch_read_gz}
29
+ self.read_hdfs_model = "all"
30
+ self.init_db()
31
+
32
+ def init_db(self):
33
+ """初始化 SQLite 数据库"""
34
+ with sqlite3.connect(self.db_file) as conn:
35
+ cursor = conn.cursor()
36
+ cursor.execute("""
37
+ CREATE TABLE IF NOT EXISTS processed_files (
38
+ file_path TEXT PRIMARY KEY
39
+ )
40
+ """)
41
+ conn.commit()
42
+
43
+ def save_processed_file(self, file_path):
44
+ """保存处理过的文件"""
45
+ with sqlite3.connect(self.db_file) as conn:
46
+ cursor = conn.cursor()
47
+ cursor.execute("INSERT OR IGNORE INTO processed_files (file_path) VALUES (?)", (file_path,))
48
+ conn.commit()
49
+
50
+ def is_file_processed(self, file_path):
51
+ """检查文件是否已处理"""
52
+ with sqlite3.connect(self.db_file) as conn:
53
+ cursor = conn.cursor()
54
+ cursor.execute("SELECT file_path FROM processed_files WHERE file_path = ?", (file_path,))
55
+ result = cursor.fetchone()
56
+ return result is not None
57
+
58
+ def list_gz_files(self, hdfs_dir):
59
+ """列出 HDFS 目录中的所有 gzip 文件"""
60
+ return [f"{hdfs_dir}/{file[0]}" for file in self.client.list(hdfs_dir, status=True) if file[0].endswith(".gz")]
61
+
62
+ def count_total_lines(self, gz_file_path: str):
63
+ with self.client.read(gz_file_path) as hdfs_file:
64
+ with gzip.GzipFile(fileobj=hdfs_file) as gz:
65
+ return sum(1 for _ in gz)
66
+
67
+ def batch_read_gz(self, gz_file_path: str):
68
+ """分批读取 gz 文件"""
69
+ with self.client.read(gz_file_path) as hdfs_file:
70
+ with gzip.GzipFile(fileobj=hdfs_file) as gz:
71
+ while True:
72
+ lines = []
73
+ for _ in range(self.batch_size):
74
+ try:
75
+ line = next(gz)
76
+ if line.strip(): # 移除空行
77
+ lines.append(line.decode("utf-8")) # 解码
78
+ except StopIteration: # 文件已读完
79
+ break
80
+ if not lines:
81
+ break
82
+ yield lines
83
+
84
+ def all_read_gz(self, gz_file_path: str, encoding='utf-8'):
85
+ """
86
+ 读取 HDFS 上的 .gz 文件内容。
87
+ :param hdfs_path: HDFS 文件路径(必须以 .gz 结尾)
88
+ :param encoding: 文件编码格式(默认 utf-8)
89
+ :return: 文件内容
90
+ """
91
+ with self.client.read(gz_file_path) as reader: # 以二进制模式读取
92
+ compressed_data = reader.read() # 读取压缩数据
93
+ with gzip.GzipFile(fileobj=BytesIO(compressed_data)) as gz_file: # 解压缩
94
+ content = gz_file.read().decode(encoding) # 解码为字符串
95
+ print(f"文件读取成功: {gz_file_path}")
96
+ lines = [i for i in content.splitlines() if i.strip()]
97
+ result = [lines[i:i + self.batch_size] for i in range(0, len(lines), self.batch_size)]
98
+ return result
99
+
100
+ async def process_data(self, data, process_func):
101
+ """处理数据并执行处理函数"""
102
+ retry_count = 0
103
+ while retry_count < self.retry_limit:
104
+ try:
105
+ await process_func(data)
106
+ return # 成功处理后退出
107
+ except Exception as e:
108
+ retry_count += 1
109
+ print(f"处理数据时发生错误: {e}, 正在重试 {retry_count}/{self.retry_limit}, data: {data}")
110
+ await asyncio.sleep(2 ** retry_count)
111
+ print(f"处理数据失败, 达到重试上限, data: {data}")
112
+
113
+ async def process_file(self, hdfs_file_path, process_func):
114
+ """处理单个 gz 文件"""
115
+ total_lines = self.count_total_lines(hdfs_file_path)
116
+ processed_lines = 0
117
+ start_time = time.time()
118
+ # # 这里根据不同的配置选用不同的读取文件的方法
119
+ for lines in self.read_hdfs_fanc[self.read_hdfs_model](hdfs_file_path):
120
+ processing_start_time = time.time() # 记录本批处理开始时间
121
+
122
+ tasks = []
123
+ for line in lines:
124
+ try:
125
+ data = json.loads(line)
126
+ tasks.append(self.process_data(data, process_func))
127
+ except json.JSONDecodeError as e:
128
+ print(f"解析JSON失败: {e}, 行内容: {line.strip()}")
129
+
130
+ # await AsyncTaskPool(self.batch_size).run(tasks) # AsyncTaskPool 适用于一次提交所有任务, 限制并发数执行
131
+ await asyncio.gather(*tasks)
132
+
133
+ processed_lines += len(lines)
134
+
135
+ elapsed_time = time.time() - start_time # 已用时间
136
+ processing_time = time.time() - processing_start_time # 本次处理时间
137
+ avg_processing_time = (
138
+ (elapsed_time * 1000) / processed_lines if processed_lines > 0 else float("inf")
139
+ ) # 平均每条数据的处理时间(毫秒)
140
+
141
+ # 估算剩余时间
142
+ remaining_time = (
143
+ ((avg_processing_time / 1000) * (total_lines - processed_lines))
144
+ if processed_lines > 0
145
+ else float("inf")
146
+ )
147
+
148
+ # 显示总进度信息
149
+ print(
150
+ f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
151
+ f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
152
+ f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
153
+ )
154
+
155
+ # 最终进度显示
156
+ final_elapsed_time = time.time() - start_time # 最终已用时间
157
+ print(
158
+ f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
159
+ f"总已用时间: {final_elapsed_time:.2f}秒 | "
160
+ f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
161
+ if processed_lines > 0
162
+ else "处理无数据"
163
+ )
164
+
165
+ self.save_processed_file(hdfs_file_path) # 保存处理过的文件
166
+
167
+ async def retry_process_file(self, hdfs_file_path, process_func):
168
+ """带重试机制的文件处理"""
169
+ retry_count = 0
170
+ while retry_count < self.retry_limit:
171
+ try:
172
+ await self.process_file(hdfs_file_path, process_func)
173
+ return True # 成功处理后退出
174
+ except Exception as e:
175
+ retry_count += 1
176
+ print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
177
+ await asyncio.sleep(2 ** retry_count)
178
+ print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
179
+ return False
180
+ # raise
181
+
182
async def batch_process_file(self, hdfs_dir: str, process_func: Callable[[dict], Any]):
    """Run ``retry_process_file`` over every gz file listed for ``hdfs_dir``.

    Files reported as already processed by ``self.is_file_processed`` are
    skipped. When no file ends in failure, the checkpoint file at
    ``self.db_file`` is removed if it exists; a failure to remove it is
    printed and otherwise ignored.

    Args:
        hdfs_dir: Directory passed to ``self.list_gz_files``.
        process_func: Handler forwarded to ``self.retry_process_file``.
    """
    any_failed = False
    for hdfs_file_path in self.list_gz_files(hdfs_dir):
        if self.is_file_processed(hdfs_file_path):
            print(f"跳过已处理文件: {hdfs_file_path}")
            continue
        outcome = await self.retry_process_file(hdfs_file_path, process_func)
        if outcome is False:
            any_failed = True

    if any_failed:
        # Keep the checkpoint file so a later run can resume.
        return

    try:
        if os.path.exists(self.db_file):
            os.remove(self.db_file)
            print(f"已删除断点重试文件: {self.db_file}")
    except Exception as e:
        print(f"删除断点重试文件失败: {e}")
202
+
203
async def process_file_bulk(self, hdfs_file_path, process_func):
    """Process one file batch by batch, handing whole batches to ``process_func``.

    Each group of lines yielded by the configured reader is JSON-decoded
    (undecodable lines are printed and dropped) and passed to
    ``process_func`` as a single list. The resulting awaitables are
    collected and awaited together once ``self.batch_size`` of them have
    accumulated; any leftovers are awaited after the reader is exhausted.
    Finally the file is recorded through ``self.save_processed_file``.

    Args:
        hdfs_file_path: File to read; also used in progress messages.
        process_func: Callable taking a list of decoded records and
            returning an awaitable.
    """
    total_lines = self.count_total_lines(hdfs_file_path)
    began = time.time()
    processed_lines = 0
    pending = []

    # The reader is selected by self.read_hdfs_model from self.read_hdfs_fanc.
    read_batches = self.read_hdfs_fanc[self.read_hdfs_model]
    for lines in read_batches(hdfs_file_path):
        batch_began = time.time()

        records = []
        for line in lines:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"解析JSON失败: {e}, 行内容: {line.strip()}")

        if records:
            pending.append(process_func(records))
            processed_lines += len(records)

        # Keep accumulating until enough batches are queued.
        if len(pending) < self.batch_size:
            continue

        await asyncio.gather(*pending)

        elapsed_time = time.time() - began
        processing_time = time.time() - batch_began
        if processed_lines > 0:
            # Average per-record time in milliseconds, then the projected rest.
            avg_processing_time = (elapsed_time * 1000) / processed_lines
            remaining_time = (avg_processing_time / 1000) * (total_lines - processed_lines)
        else:
            avg_processing_time = float("inf")
            remaining_time = float("inf")

        print(
            f"文件: {hdfs_file_path} 总进度: {processed_lines}/{total_lines} 行 | "
            f"已用时间: {elapsed_time:.2f}秒 | 本次处理时间: {processing_time:.2f}秒 | "
            f"预估剩余时间: {remaining_time:.2f}秒 | 平均每条处理时间: {avg_processing_time:.2f}毫秒"
        )

        pending.clear()

    # Flush whatever did not reach a full batch_size group.
    if pending:
        await asyncio.gather(*pending)

    final_elapsed_time = time.time() - began
    if processed_lines > 0:
        summary = (
            f"文件: {hdfs_file_path} 处理完成 | 总进度: {processed_lines}/{total_lines} 行 | "
            f"总已用时间: {final_elapsed_time:.2f}秒 | "
            f"平均每条处理时间: {(final_elapsed_time * 1000) / processed_lines:.2f}毫秒"
        )
    else:
        summary = "处理无数据"
    print(summary)

    self.save_processed_file(hdfs_file_path)
268
+
269
async def retry_process_file_bulk(self, hdfs_file_path, process_func):
    """Process one file via ``self.process_file_bulk``, retrying on failure.

    Up to ``self.retry_limit`` attempts are made. After a failed attempt that
    still has a retry left, the coroutine waits ``2 ** attempt`` seconds
    before trying again. No wait happens after the final failed attempt,
    because nothing follows it.

    Args:
        hdfs_file_path: Path handed through to ``self.process_file_bulk``.
        process_func: Batch handler handed through to
            ``self.process_file_bulk``.

    Returns:
        True as soon as one attempt completes without raising; False when
        every attempt failed (or ``self.retry_limit`` allows no attempt).
    """
    retry_count = 0
    while retry_count < self.retry_limit:
        try:
            await self.process_file_bulk(hdfs_file_path, process_func)
            return True
        except Exception as e:
            retry_count += 1
            print(f"处理文件 {hdfs_file_path} 时发生错误: {e},正在重试 {retry_count}/{self.retry_limit}")
            # Back off only when another attempt will actually be made.
            if retry_count < self.retry_limit:
                await asyncio.sleep(2 ** retry_count)
    print(f"处理文件 {hdfs_file_path} 失败,达到重试上限")
    return False
282
+
283
async def batch_process_file_bulk(self, hdfs_dir: str, process_func: Callable[[List[dict]], Any]):
    """Run ``retry_process_file_bulk`` over every gz file listed for ``hdfs_dir``.

    Files reported as already processed by ``self.is_file_processed`` are
    skipped. When no file ends in failure, the checkpoint file at
    ``self.db_file`` is removed if it exists; a failure to remove it is
    printed and otherwise ignored.

    Args:
        hdfs_dir: Directory passed to ``self.list_gz_files``.
        process_func: Batch handler forwarded to
            ``self.retry_process_file_bulk``.
    """
    failures = 0
    for hdfs_file_path in self.list_gz_files(hdfs_dir):
        if self.is_file_processed(hdfs_file_path):
            print(f"跳过已处理文件: {hdfs_file_path}")
            continue
        if await self.retry_process_file_bulk(hdfs_file_path, process_func) is False:
            failures += 1

    if failures:
        # Keep the checkpoint file so a later run can resume.
        return

    try:
        if os.path.exists(self.db_file):
            os.remove(self.db_file)
            print(f"已删除断点重试文件: {self.db_file}")
    except Exception as e:
        print(f"删除断点重试文件失败: {e}")
303
+
304
+ # # 使用示例
305
+ # async def update_refer(data: dict):
306
+ # ref_id = data["ref_id"]
307
+ # url = f"http://192.168.98.79:8150/v1/fact_refer/update/{ref_id}"
308
+ # update_data = data["update_data"]
309
+ # if not update_data:
310
+ # return
311
+ #
312
+ # # 此处为实际处理逻辑
313
+ # await ApiNetUtils.fetch_post(url=url, payload=update_data)
314
+ #
315
+ #
316
+ # if __name__ == "__main__":
317
+ # processor = HDFSDataProcessor() # 实例化数据处理类
318
+ # asyncio.run(processor.batch_process_file("/user/libaiyun/output/confidence", update_refer))
@@ -3,9 +3,10 @@ from typing import List
3
3
  import jieba
4
4
  from datasketch import MinHash, minhash
5
5
 
6
+ from re_common.v2.baselibrary.decorators.utils import deprecated
6
7
  from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
7
8
 
8
-
9
+ @deprecated("请使用 TextMatcherV2 中的方法代替。")
9
10
  def tokenize(text: str, stopwords=None) -> List[str]:
10
11
  """
11
12
  分词并移除停用词
@@ -32,7 +33,7 @@ def tokenize(text: str, stopwords=None) -> List[str]:
32
33
  words = [w for w in words if w not in stopwords and w.strip()]
33
34
  return words
34
35
 
35
-
36
+ @deprecated("请使用 TextMatcherV2 中的方法代替。")
36
37
  def create_minhash(words: List[str], num_perm=128) -> MinHash:
37
38
  """
38
39
  为分词结果创建 MinHash
@@ -42,7 +43,7 @@ def create_minhash(words: List[str], num_perm=128) -> MinHash:
42
43
  minhash.update(word.encode("utf-8"))
43
44
  return minhash
44
45
 
45
-
46
+ @deprecated("请使用 TextMatcherV2 中的方法代替。")
46
47
  def get_str_minhash(title):
47
48
  from re_common.v2.baselibrary.utils.string_clear import rel_clear
48
49
  rel_title = rel_clear(title)
@@ -1,10 +1,16 @@
1
+ import pickle
2
+
1
3
  import jieba
2
4
  import re
3
- from typing import List, Dict, Tuple, Set, Optional, Union
5
+ from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
4
6
  from datasketch import MinHash, MinHashLSH
5
7
 
8
+ from re_common.v2.baselibrary.decorators.utils import deprecated
9
+ from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char
10
+
6
11
 
7
- class TextMatcher:
12
+ @deprecated("请使用 TextMatcherV2 代替。")
13
+ class TextMatcher(object):
8
14
  def __init__(
9
15
  self,
10
16
  threshold: float = 0.5,
@@ -188,36 +194,133 @@ class TextMatcher:
188
194
  self.doc_counter = 0
189
195
 
190
196
 
191
- if __name__ == "__main__":
192
- # 创建匹配器实例
193
- matcher = TextMatcher(
194
- threshold=0.1, # 相似度阈值
195
- num_perm=128, # MinHash 排列数
196
- )
197
+ # 定义一个协议,描述“像鸭子一样”的行为
198
class TokenizeDuckLike(Protocol):
    """Structural type for tokenizers: any object exposing ``get_words(text)``."""

    def get_words(self, text) -> List:
        """Split ``text`` into a list of tokens."""
        ...
201
+
202
+
203
class JiebaTokenize(object):
    """Tokenizer built on ``jieba.lcut`` with optional stop-word filtering."""

    def __init__(self, stopwords=None):
        """Store the stop-word collection.

        Args:
            stopwords: Words to drop from tokenizer output. ``None`` means
                no stop-word filtering.
        """
        self.stopwords = stopwords

    def get_words(self, text) -> List:
        """Tokenize ``text`` and drop stop words and blank tokens.

        The text is first cut with ``jieba.lcut``. If more than 60% of the
        resulting tokens are single non-CJK characters, the jieba result is
        discarded and the text is split on whitespace instead.

        Args:
            text: String to tokenize.

        Returns:
            List of tokens that are not in the stop-word collection and are
            not whitespace-only.
        """
        # Bind the local on every path. It used to be assigned only when
        # self.stopwords was None, so supplying stop words raised
        # UnboundLocalError at the final filter.
        stopwords = self.stopwords if self.stopwords is not None else []
        words = jieba.lcut(text)

        # Kept as a function rather than an inline comprehension condition:
        # per the original author's note, some Spark setups do not
        # short-circuit a comprehension's `if` and evaluate it as one unit.
        def is_single_non_cjk(token):
            if len(token) == 1 and not is_single_cjk_char(token):
                return True
            return False

        one_char_size = sum(1 for token in words if is_single_non_cjk(token))
        all_size = len(words)
        # Too many lone characters: fall back to whitespace splitting.
        # str.split() with no argument never yields empty or blank items.
        if all_size != 0 and one_char_size / all_size > 0.6:
            words = text.split()

        return [w for w in words if w not in stopwords and w.strip()]
231
+
232
+
233
class TextMatcherV2(object):
    """Near-duplicate text lookup backed by a ``datasketch.MinHashLSH`` index.

    Documents and queries may be given either as ready-made ``MinHash``
    objects or as raw strings; strings are tokenized with a
    ``TokenizeDuckLike`` tokenizer and hashed on the fly.
    """

    def __init__(
            self,
            threshold: float = 0.5,
            num_perm: int = 128,
            tdk: Optional[TokenizeDuckLike] = None
    ):
        """Create an empty index.

        Args:
            threshold: Similarity threshold passed to ``MinHashLSH``.
            num_perm: Number of permutations used for the LSH index and for
                every MinHash this instance creates.
            tdk: Default tokenizer used when a method is called without its
                own ``tdk`` argument.
        """
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.tdk = tdk

    def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: Optional[TokenizeDuckLike] = None):
        """Insert one document into the index.

        Args:
            doc_id: Key under which the document is stored.
            minhash: A ``MinHash``, or raw text to be tokenized and hashed.
            tdk: Tokenizer for raw text; falls back to ``self.tdk``.
        """
        if isinstance(minhash, str):
            minhash = self.str_to_minihash(minhash, tdk)
        self.lsh.insert(doc_id, minhash)

    def batch_add_documents(self, betch_data: Union[list, dict], tdk: Optional[TokenizeDuckLike] = None):
        """Insert many documents into the index.

        Args:
            betch_data: Either a list of ``(doc_id, minhash_or_text)`` pairs
                (each item must unpack into exactly two values) or a dict
                mapping ``doc_id`` to a ``MinHash`` or raw text.
            tdk: Tokenizer for raw text; falls back to ``self.tdk``.

        Raises:
            TypeError: If ``betch_data`` is neither a list nor a dict.
        """
        if isinstance(betch_data, list):
            pairs = betch_data
        elif isinstance(betch_data, dict):
            pairs = betch_data.items()
        else:
            # TypeError is still caught by callers using `except Exception`.
            raise TypeError("数据类型错误")

        # The doc id is passed explicitly; add_document handles the
        # str -> MinHash conversion, so it happens exactly once.
        for doc_id, minhash_or_str in pairs:
            self.add_document(doc_id, minhash_or_str, tdk)

    def find_similar(self, query_minhash: Union[MinHash, str], tdk: Optional[TokenizeDuckLike] = None) -> List[Hashable]:
        """Return the keys of indexed documents the LSH reports as candidates.

        Args:
            query_minhash: A ``MinHash``, or raw text to be tokenized and
                hashed.
            tdk: Tokenizer for raw text; falls back to ``self.tdk``.

        Returns:
            The result of ``MinHashLSH.query`` for the query.
        """
        if isinstance(query_minhash, str):
            query_minhash = self.str_to_minihash(query_minhash, tdk)
        return self.lsh.query(query_minhash)

    def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
        """Build a ``MinHash`` from a list of tokens.

        Args:
            words: Tokens; each is UTF-8 encoded before being hashed.
            num_perm: Number of permutations; defaults to ``self.num_perm``.
        """
        if num_perm is None:
            num_perm = self.num_perm
        minhash = MinHash(num_perm=num_perm)
        for word in words:
            minhash.update(word.encode("utf-8"))
        return minhash

    def create_words(self, text: str, tdk: Optional[TokenizeDuckLike] = None):
        """Tokenize ``text`` with ``tdk``, or with ``self.tdk`` when omitted.

        Raises:
            ValueError: If no tokenizer was given here or at construction.
        """
        if tdk is None:
            tdk = self.tdk
        if tdk is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                "No tokenizer available: pass `tdk` to this call or to TextMatcherV2()."
            )
        return tdk.get_words(text)

    def str_to_minihash(self, text: str, tdk: Optional[TokenizeDuckLike] = None):
        """Tokenize ``text`` and return its ``MinHash`` with ``self.num_perm`` permutations."""
        words = self.create_words(text, tdk)
        return self.create_minhash(words, self.num_perm)

    def minhash_dumps(self, minhash) -> bytes:
        """Serialize a ``MinHash`` to bytes with ``pickle``."""
        return pickle.dumps(minhash)

    def minhash_loads(self, serialized_minhash) -> MinHash:
        """Deserialize bytes produced by ``minhash_dumps``.

        NOTE(security): ``pickle.loads`` can execute arbitrary code; only
        feed it data from a trusted source.
        """
        return pickle.loads(serialized_minhash)

    def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
        """Merge an LSH index built elsewhere into this instance's index."""
        self.lsh.merge(other_minhashlsh)