corp-extractor 0.2.8__tar.gz → 0.9.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/.gitignore +1 -0
  2. corp_extractor-0.9.4/PKG-INFO +834 -0
  3. corp_extractor-0.9.4/README.md +777 -0
  4. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/pyproject.toml +36 -10
  5. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/__init__.py +12 -1
  6. corp_extractor-0.9.4/src/statement_extractor/cli.py +3127 -0
  7. corp_extractor-0.9.4/src/statement_extractor/data/default_predicates.json +368 -0
  8. corp_extractor-0.9.4/src/statement_extractor/data/statement_taxonomy.json +6972 -0
  9. corp_extractor-0.9.4/src/statement_extractor/database/__init__.py +52 -0
  10. corp_extractor-0.9.4/src/statement_extractor/database/embeddings.py +231 -0
  11. corp_extractor-0.9.4/src/statement_extractor/database/hub.py +470 -0
  12. corp_extractor-0.9.4/src/statement_extractor/database/importers/__init__.py +32 -0
  13. corp_extractor-0.9.4/src/statement_extractor/database/importers/companies_house.py +559 -0
  14. corp_extractor-0.9.4/src/statement_extractor/database/importers/companies_house_officers.py +431 -0
  15. corp_extractor-0.9.4/src/statement_extractor/database/importers/gleif.py +561 -0
  16. corp_extractor-0.9.4/src/statement_extractor/database/importers/import_utils.py +264 -0
  17. corp_extractor-0.9.4/src/statement_extractor/database/importers/sec_edgar.py +392 -0
  18. corp_extractor-0.9.4/src/statement_extractor/database/importers/sec_form4.py +512 -0
  19. corp_extractor-0.9.4/src/statement_extractor/database/importers/wikidata.py +1120 -0
  20. corp_extractor-0.9.4/src/statement_extractor/database/importers/wikidata_dump.py +2282 -0
  21. corp_extractor-0.9.4/src/statement_extractor/database/importers/wikidata_people.py +1174 -0
  22. corp_extractor-0.9.4/src/statement_extractor/database/migrate_v2.py +852 -0
  23. corp_extractor-0.9.4/src/statement_extractor/database/models.py +378 -0
  24. corp_extractor-0.9.4/src/statement_extractor/database/resolver.py +245 -0
  25. corp_extractor-0.9.4/src/statement_extractor/database/schema_v2.py +409 -0
  26. corp_extractor-0.9.4/src/statement_extractor/database/seed_data.py +359 -0
  27. corp_extractor-0.9.4/src/statement_extractor/database/store.py +4825 -0
  28. corp_extractor-0.9.4/src/statement_extractor/document/__init__.py +62 -0
  29. corp_extractor-0.9.4/src/statement_extractor/document/chunker.py +410 -0
  30. corp_extractor-0.9.4/src/statement_extractor/document/context.py +171 -0
  31. corp_extractor-0.9.4/src/statement_extractor/document/deduplicator.py +171 -0
  32. corp_extractor-0.9.4/src/statement_extractor/document/html_extractor.py +246 -0
  33. corp_extractor-0.9.4/src/statement_extractor/document/loader.py +303 -0
  34. corp_extractor-0.9.4/src/statement_extractor/document/pipeline.py +388 -0
  35. corp_extractor-0.9.4/src/statement_extractor/document/summarizer.py +195 -0
  36. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/extractor.py +356 -25
  37. corp_extractor-0.9.4/src/statement_extractor/gliner_extraction.py +218 -0
  38. corp_extractor-0.9.4/src/statement_extractor/llm.py +255 -0
  39. corp_extractor-0.9.4/src/statement_extractor/models/__init__.py +90 -0
  40. corp_extractor-0.9.4/src/statement_extractor/models/canonical.py +182 -0
  41. corp_extractor-0.9.4/src/statement_extractor/models/document.py +308 -0
  42. corp_extractor-0.9.4/src/statement_extractor/models/entity.py +102 -0
  43. corp_extractor-0.9.4/src/statement_extractor/models/labels.py +220 -0
  44. corp_extractor-0.9.4/src/statement_extractor/models/qualifiers.py +139 -0
  45. corp_extractor-0.9.4/src/statement_extractor/models/statement.py +99 -0
  46. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/models.py +43 -2
  47. corp_extractor-0.9.4/src/statement_extractor/pipeline/__init__.py +39 -0
  48. corp_extractor-0.9.4/src/statement_extractor/pipeline/config.py +129 -0
  49. corp_extractor-0.9.4/src/statement_extractor/pipeline/context.py +177 -0
  50. corp_extractor-0.9.4/src/statement_extractor/pipeline/orchestrator.py +416 -0
  51. corp_extractor-0.9.4/src/statement_extractor/pipeline/registry.py +303 -0
  52. corp_extractor-0.9.4/src/statement_extractor/plugins/__init__.py +55 -0
  53. corp_extractor-0.9.4/src/statement_extractor/plugins/base.py +716 -0
  54. corp_extractor-0.9.4/src/statement_extractor/plugins/extractors/__init__.py +13 -0
  55. corp_extractor-0.9.4/src/statement_extractor/plugins/extractors/base.py +9 -0
  56. corp_extractor-0.9.4/src/statement_extractor/plugins/extractors/gliner2.py +546 -0
  57. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/__init__.py +29 -0
  58. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/base.py +9 -0
  59. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/confidence.py +138 -0
  60. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/relation_type.py +87 -0
  61. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/sentiment.py +159 -0
  62. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/taxonomy.py +386 -0
  63. corp_extractor-0.9.4/src/statement_extractor/plugins/labelers/taxonomy_embedding.py +477 -0
  64. corp_extractor-0.9.4/src/statement_extractor/plugins/pdf/__init__.py +10 -0
  65. corp_extractor-0.9.4/src/statement_extractor/plugins/pdf/pypdf.py +291 -0
  66. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/__init__.py +30 -0
  67. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/base.py +9 -0
  68. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/companies_house.py +185 -0
  69. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/embedding_company.py +422 -0
  70. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/gleif.py +197 -0
  71. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/person.py +852 -0
  72. corp_extractor-0.9.4/src/statement_extractor/plugins/qualifiers/sec_edgar.py +209 -0
  73. corp_extractor-0.9.4/src/statement_extractor/plugins/scrapers/__init__.py +10 -0
  74. corp_extractor-0.9.4/src/statement_extractor/plugins/scrapers/http.py +236 -0
  75. corp_extractor-0.9.4/src/statement_extractor/plugins/splitters/__init__.py +13 -0
  76. corp_extractor-0.9.4/src/statement_extractor/plugins/splitters/base.py +9 -0
  77. corp_extractor-0.9.4/src/statement_extractor/plugins/splitters/t5_gemma.py +289 -0
  78. corp_extractor-0.9.4/src/statement_extractor/plugins/taxonomy/__init__.py +13 -0
  79. corp_extractor-0.9.4/src/statement_extractor/plugins/taxonomy/embedding.py +484 -0
  80. corp_extractor-0.9.4/src/statement_extractor/plugins/taxonomy/mnli.py +291 -0
  81. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/predicate_comparer.py +17 -0
  82. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/scoring.py +130 -90
  83. corp_extractor-0.2.8/PKG-INFO +0 -377
  84. corp_extractor-0.2.8/README.md +0 -336
  85. corp_extractor-0.2.8/src/statement_extractor/cli.py +0 -215
  86. {corp_extractor-0.2.8 → corp_extractor-0.9.4}/src/statement_extractor/canonicalization.py +0 -0
@@ -60,3 +60,4 @@ uv.lock
 # Model files (too large for git)
 /model/
 /.claude/settings.local.json
+/statement-extractor-lib/.claude/settings.local.json