PyPI - truthound - Versions diffs - 1.0.8__py3-none-any.whl - Mend

truthound 1.0.8__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (877)

truthound/__init__.py +162 -0
truthound/adapters.py +100 -0
truthound/api.py +365 -0
truthound/audit/__init__.py +248 -0
truthound/audit/core.py +967 -0
truthound/audit/filters.py +620 -0
truthound/audit/formatters.py +707 -0
truthound/audit/logger.py +902 -0
truthound/audit/middleware.py +571 -0
truthound/audit/storage.py +1083 -0
truthound/benchmark/__init__.py +123 -0
truthound/benchmark/base.py +757 -0
truthound/benchmark/comparison.py +635 -0
truthound/benchmark/generators.py +706 -0
truthound/benchmark/reporters.py +718 -0
truthound/benchmark/runner.py +635 -0
truthound/benchmark/scenarios.py +712 -0
truthound/cache.py +252 -0
truthound/checkpoint/__init__.py +136 -0
truthound/checkpoint/actions/__init__.py +164 -0
truthound/checkpoint/actions/base.py +324 -0
truthound/checkpoint/actions/custom.py +234 -0
truthound/checkpoint/actions/discord_notify.py +290 -0
truthound/checkpoint/actions/email_notify.py +405 -0
truthound/checkpoint/actions/github_action.py +406 -0
truthound/checkpoint/actions/opsgenie.py +1499 -0
truthound/checkpoint/actions/pagerduty.py +226 -0
truthound/checkpoint/actions/slack_notify.py +233 -0
truthound/checkpoint/actions/store_result.py +249 -0
truthound/checkpoint/actions/teams_notify.py +1570 -0
truthound/checkpoint/actions/telegram_notify.py +419 -0
truthound/checkpoint/actions/update_docs.py +552 -0
truthound/checkpoint/actions/webhook.py +293 -0
truthound/checkpoint/analytics/__init__.py +147 -0
truthound/checkpoint/analytics/aggregations/__init__.py +23 -0
truthound/checkpoint/analytics/aggregations/rollup.py +481 -0
truthound/checkpoint/analytics/aggregations/time_bucket.py +306 -0
truthound/checkpoint/analytics/analyzers/__init__.py +17 -0
truthound/checkpoint/analytics/analyzers/anomaly.py +386 -0
truthound/checkpoint/analytics/analyzers/base.py +270 -0
truthound/checkpoint/analytics/analyzers/forecast.py +421 -0
truthound/checkpoint/analytics/analyzers/trend.py +314 -0
truthound/checkpoint/analytics/models.py +292 -0
truthound/checkpoint/analytics/protocols.py +549 -0
truthound/checkpoint/analytics/service.py +718 -0
truthound/checkpoint/analytics/stores/__init__.py +16 -0
truthound/checkpoint/analytics/stores/base.py +306 -0
truthound/checkpoint/analytics/stores/memory_store.py +353 -0
truthound/checkpoint/analytics/stores/sqlite_store.py +557 -0
truthound/checkpoint/analytics/stores/timescale_store.py +501 -0
truthound/checkpoint/async_actions.py +794 -0
truthound/checkpoint/async_base.py +708 -0
truthound/checkpoint/async_checkpoint.py +617 -0
truthound/checkpoint/async_runner.py +639 -0
truthound/checkpoint/checkpoint.py +527 -0
truthound/checkpoint/ci/__init__.py +61 -0
truthound/checkpoint/ci/detector.py +355 -0
truthound/checkpoint/ci/reporter.py +436 -0
truthound/checkpoint/ci/templates.py +454 -0
truthound/checkpoint/circuitbreaker/__init__.py +133 -0
truthound/checkpoint/circuitbreaker/breaker.py +542 -0
truthound/checkpoint/circuitbreaker/core.py +252 -0
truthound/checkpoint/circuitbreaker/detection.py +459 -0
truthound/checkpoint/circuitbreaker/middleware.py +389 -0
truthound/checkpoint/circuitbreaker/registry.py +357 -0
truthound/checkpoint/distributed/__init__.py +139 -0
truthound/checkpoint/distributed/backends/__init__.py +35 -0
truthound/checkpoint/distributed/backends/celery_backend.py +503 -0
truthound/checkpoint/distributed/backends/kubernetes_backend.py +696 -0
truthound/checkpoint/distributed/backends/local_backend.py +397 -0
truthound/checkpoint/distributed/backends/ray_backend.py +625 -0
truthound/checkpoint/distributed/base.py +774 -0
truthound/checkpoint/distributed/orchestrator.py +765 -0
truthound/checkpoint/distributed/protocols.py +842 -0
truthound/checkpoint/distributed/registry.py +449 -0
truthound/checkpoint/idempotency/__init__.py +120 -0
truthound/checkpoint/idempotency/core.py +295 -0
truthound/checkpoint/idempotency/fingerprint.py +454 -0
truthound/checkpoint/idempotency/locking.py +604 -0
truthound/checkpoint/idempotency/service.py +592 -0
truthound/checkpoint/idempotency/stores.py +653 -0
truthound/checkpoint/monitoring/__init__.py +134 -0
truthound/checkpoint/monitoring/aggregators/__init__.py +15 -0
truthound/checkpoint/monitoring/aggregators/base.py +372 -0
truthound/checkpoint/monitoring/aggregators/realtime.py +300 -0
truthound/checkpoint/monitoring/aggregators/window.py +493 -0
truthound/checkpoint/monitoring/collectors/__init__.py +17 -0
truthound/checkpoint/monitoring/collectors/base.py +257 -0
truthound/checkpoint/monitoring/collectors/memory_collector.py +617 -0
truthound/checkpoint/monitoring/collectors/prometheus_collector.py +451 -0
truthound/checkpoint/monitoring/collectors/redis_collector.py +518 -0
truthound/checkpoint/monitoring/events.py +410 -0
truthound/checkpoint/monitoring/protocols.py +636 -0
truthound/checkpoint/monitoring/service.py +578 -0
truthound/checkpoint/monitoring/views/__init__.py +17 -0
truthound/checkpoint/monitoring/views/base.py +172 -0
truthound/checkpoint/monitoring/views/queue_view.py +220 -0
truthound/checkpoint/monitoring/views/task_view.py +240 -0
truthound/checkpoint/monitoring/views/worker_view.py +263 -0
truthound/checkpoint/registry.py +337 -0
truthound/checkpoint/runner.py +356 -0
truthound/checkpoint/transaction/__init__.py +133 -0
truthound/checkpoint/transaction/base.py +389 -0
truthound/checkpoint/transaction/compensatable.py +537 -0
truthound/checkpoint/transaction/coordinator.py +576 -0
truthound/checkpoint/transaction/executor.py +622 -0
truthound/checkpoint/transaction/idempotency.py +534 -0
truthound/checkpoint/transaction/saga/__init__.py +143 -0
truthound/checkpoint/transaction/saga/builder.py +584 -0
truthound/checkpoint/transaction/saga/definition.py +515 -0
truthound/checkpoint/transaction/saga/event_store.py +542 -0
truthound/checkpoint/transaction/saga/patterns.py +833 -0
truthound/checkpoint/transaction/saga/runner.py +718 -0
truthound/checkpoint/transaction/saga/state_machine.py +793 -0
truthound/checkpoint/transaction/saga/strategies.py +780 -0
truthound/checkpoint/transaction/saga/testing.py +886 -0
truthound/checkpoint/triggers/__init__.py +58 -0
truthound/checkpoint/triggers/base.py +237 -0
truthound/checkpoint/triggers/event.py +385 -0
truthound/checkpoint/triggers/schedule.py +355 -0
truthound/cli.py +2358 -0
truthound/cli_modules/__init__.py +124 -0
truthound/cli_modules/advanced/__init__.py +45 -0
truthound/cli_modules/advanced/benchmark.py +343 -0
truthound/cli_modules/advanced/docs.py +225 -0
truthound/cli_modules/advanced/lineage.py +209 -0
truthound/cli_modules/advanced/ml.py +320 -0
truthound/cli_modules/advanced/realtime.py +196 -0
truthound/cli_modules/checkpoint/__init__.py +46 -0
truthound/cli_modules/checkpoint/init.py +114 -0
truthound/cli_modules/checkpoint/list.py +71 -0
truthound/cli_modules/checkpoint/run.py +159 -0
truthound/cli_modules/checkpoint/validate.py +67 -0
truthound/cli_modules/common/__init__.py +71 -0
truthound/cli_modules/common/errors.py +414 -0
truthound/cli_modules/common/options.py +419 -0
truthound/cli_modules/common/output.py +507 -0
truthound/cli_modules/common/protocol.py +552 -0
truthound/cli_modules/core/__init__.py +48 -0
truthound/cli_modules/core/check.py +123 -0
truthound/cli_modules/core/compare.py +104 -0
truthound/cli_modules/core/learn.py +57 -0
truthound/cli_modules/core/mask.py +77 -0
truthound/cli_modules/core/profile.py +65 -0
truthound/cli_modules/core/scan.py +61 -0
truthound/cli_modules/profiler/__init__.py +51 -0
truthound/cli_modules/profiler/auto_profile.py +175 -0
truthound/cli_modules/profiler/metadata.py +107 -0
truthound/cli_modules/profiler/suite.py +283 -0
truthound/cli_modules/registry.py +431 -0
truthound/cli_modules/scaffolding/__init__.py +89 -0
truthound/cli_modules/scaffolding/base.py +631 -0
truthound/cli_modules/scaffolding/commands.py +545 -0
truthound/cli_modules/scaffolding/plugins.py +1072 -0
truthound/cli_modules/scaffolding/reporters.py +594 -0
truthound/cli_modules/scaffolding/validators.py +1127 -0
truthound/common/__init__.py +18 -0
truthound/common/resilience/__init__.py +130 -0
truthound/common/resilience/bulkhead.py +266 -0
truthound/common/resilience/circuit_breaker.py +516 -0
truthound/common/resilience/composite.py +332 -0
truthound/common/resilience/config.py +292 -0
truthound/common/resilience/protocols.py +217 -0
truthound/common/resilience/rate_limiter.py +404 -0
truthound/common/resilience/retry.py +341 -0
truthound/datadocs/__init__.py +260 -0
truthound/datadocs/base.py +571 -0
truthound/datadocs/builder.py +761 -0
truthound/datadocs/charts.py +764 -0
truthound/datadocs/dashboard/__init__.py +63 -0
truthound/datadocs/dashboard/app.py +576 -0
truthound/datadocs/dashboard/components.py +584 -0
truthound/datadocs/dashboard/state.py +240 -0
truthound/datadocs/engine/__init__.py +46 -0
truthound/datadocs/engine/context.py +376 -0
truthound/datadocs/engine/pipeline.py +618 -0
truthound/datadocs/engine/registry.py +469 -0
truthound/datadocs/exporters/__init__.py +49 -0
truthound/datadocs/exporters/base.py +198 -0
truthound/datadocs/exporters/html.py +178 -0
truthound/datadocs/exporters/json_exporter.py +253 -0
truthound/datadocs/exporters/markdown.py +284 -0
truthound/datadocs/exporters/pdf.py +392 -0
truthound/datadocs/i18n/__init__.py +86 -0
truthound/datadocs/i18n/catalog.py +960 -0
truthound/datadocs/i18n/formatting.py +505 -0
truthound/datadocs/i18n/loader.py +256 -0
truthound/datadocs/i18n/plurals.py +378 -0
truthound/datadocs/renderers/__init__.py +42 -0
truthound/datadocs/renderers/base.py +401 -0
truthound/datadocs/renderers/custom.py +342 -0
truthound/datadocs/renderers/jinja.py +697 -0
truthound/datadocs/sections.py +736 -0
truthound/datadocs/styles.py +931 -0
truthound/datadocs/themes/__init__.py +101 -0
truthound/datadocs/themes/base.py +336 -0
truthound/datadocs/themes/default.py +417 -0
truthound/datadocs/themes/enterprise.py +419 -0
truthound/datadocs/themes/loader.py +336 -0
truthound/datadocs/themes.py +301 -0
truthound/datadocs/transformers/__init__.py +57 -0
truthound/datadocs/transformers/base.py +268 -0
truthound/datadocs/transformers/enrichers.py +544 -0
truthound/datadocs/transformers/filters.py +447 -0
truthound/datadocs/transformers/i18n.py +468 -0
truthound/datadocs/versioning/__init__.py +62 -0
truthound/datadocs/versioning/diff.py +639 -0
truthound/datadocs/versioning/storage.py +497 -0
truthound/datadocs/versioning/version.py +358 -0
truthound/datasources/__init__.py +223 -0
truthound/datasources/_async_protocols.py +222 -0
truthound/datasources/_protocols.py +159 -0
truthound/datasources/adapters.py +428 -0
truthound/datasources/async_base.py +599 -0
truthound/datasources/async_factory.py +511 -0
truthound/datasources/base.py +516 -0
truthound/datasources/factory.py +433 -0
truthound/datasources/nosql/__init__.py +47 -0
truthound/datasources/nosql/base.py +487 -0
truthound/datasources/nosql/elasticsearch.py +801 -0
truthound/datasources/nosql/mongodb.py +636 -0
truthound/datasources/pandas_optimized.py +582 -0
truthound/datasources/pandas_source.py +216 -0
truthound/datasources/polars_source.py +395 -0
truthound/datasources/spark_source.py +479 -0
truthound/datasources/sql/__init__.py +154 -0
truthound/datasources/sql/base.py +710 -0
truthound/datasources/sql/bigquery.py +410 -0
truthound/datasources/sql/cloud_base.py +199 -0
truthound/datasources/sql/databricks.py +471 -0
truthound/datasources/sql/mysql.py +316 -0
truthound/datasources/sql/oracle.py +427 -0
truthound/datasources/sql/postgresql.py +321 -0
truthound/datasources/sql/redshift.py +479 -0
truthound/datasources/sql/snowflake.py +439 -0
truthound/datasources/sql/sqlite.py +286 -0
truthound/datasources/sql/sqlserver.py +437 -0
truthound/datasources/streaming/__init__.py +47 -0
truthound/datasources/streaming/base.py +350 -0
truthound/datasources/streaming/kafka.py +670 -0
truthound/decorators.py +98 -0
truthound/docs/__init__.py +69 -0
truthound/docs/extractor.py +971 -0
truthound/docs/generator.py +601 -0
truthound/docs/parser.py +1037 -0
truthound/docs/renderer.py +999 -0
truthound/drift/__init__.py +22 -0
truthound/drift/compare.py +189 -0
truthound/drift/detectors.py +464 -0
truthound/drift/report.py +160 -0
truthound/execution/__init__.py +65 -0
truthound/execution/_protocols.py +324 -0
truthound/execution/base.py +576 -0
truthound/execution/distributed/__init__.py +179 -0
truthound/execution/distributed/aggregations.py +731 -0
truthound/execution/distributed/arrow_bridge.py +817 -0
truthound/execution/distributed/base.py +550 -0
truthound/execution/distributed/dask_engine.py +976 -0
truthound/execution/distributed/mixins.py +766 -0
truthound/execution/distributed/protocols.py +756 -0
truthound/execution/distributed/ray_engine.py +1127 -0
truthound/execution/distributed/registry.py +446 -0
truthound/execution/distributed/spark_engine.py +1011 -0
truthound/execution/distributed/validator_adapter.py +682 -0
truthound/execution/pandas_engine.py +401 -0
truthound/execution/polars_engine.py +497 -0
truthound/execution/pushdown/__init__.py +230 -0
truthound/execution/pushdown/ast.py +1550 -0
truthound/execution/pushdown/builder.py +1550 -0
truthound/execution/pushdown/dialects.py +1072 -0
truthound/execution/pushdown/executor.py +829 -0
truthound/execution/pushdown/optimizer.py +1041 -0
truthound/execution/sql_engine.py +518 -0
truthound/infrastructure/__init__.py +189 -0
truthound/infrastructure/audit.py +1515 -0
truthound/infrastructure/config.py +1133 -0
truthound/infrastructure/encryption.py +1132 -0
truthound/infrastructure/logging.py +1503 -0
truthound/infrastructure/metrics.py +1220 -0
truthound/lineage/__init__.py +89 -0
truthound/lineage/base.py +746 -0
truthound/lineage/impact_analysis.py +474 -0
truthound/lineage/integrations/__init__.py +22 -0
truthound/lineage/integrations/openlineage.py +548 -0
truthound/lineage/tracker.py +512 -0
truthound/lineage/visualization/__init__.py +33 -0
truthound/lineage/visualization/protocols.py +145 -0
truthound/lineage/visualization/renderers/__init__.py +20 -0
truthound/lineage/visualization/renderers/cytoscape.py +329 -0
truthound/lineage/visualization/renderers/d3.py +331 -0
truthound/lineage/visualization/renderers/graphviz.py +276 -0
truthound/lineage/visualization/renderers/mermaid.py +308 -0
truthound/maskers.py +113 -0
truthound/ml/__init__.py +124 -0
truthound/ml/anomaly_models/__init__.py +31 -0
truthound/ml/anomaly_models/ensemble.py +362 -0
truthound/ml/anomaly_models/isolation_forest.py +444 -0
truthound/ml/anomaly_models/statistical.py +392 -0
truthound/ml/base.py +1178 -0
truthound/ml/drift_detection/__init__.py +26 -0
truthound/ml/drift_detection/concept.py +381 -0
truthound/ml/drift_detection/distribution.py +361 -0
truthound/ml/drift_detection/feature.py +442 -0
truthound/ml/drift_detection/multivariate.py +495 -0
truthound/ml/monitoring/__init__.py +88 -0
truthound/ml/monitoring/alerting/__init__.py +33 -0
truthound/ml/monitoring/alerting/handlers.py +427 -0
truthound/ml/monitoring/alerting/rules.py +508 -0
truthound/ml/monitoring/collectors/__init__.py +19 -0
truthound/ml/monitoring/collectors/composite.py +105 -0
truthound/ml/monitoring/collectors/drift.py +324 -0
truthound/ml/monitoring/collectors/performance.py +179 -0
truthound/ml/monitoring/collectors/quality.py +369 -0
truthound/ml/monitoring/monitor.py +536 -0
truthound/ml/monitoring/protocols.py +451 -0
truthound/ml/monitoring/stores/__init__.py +15 -0
truthound/ml/monitoring/stores/memory.py +201 -0
truthound/ml/monitoring/stores/prometheus.py +296 -0
truthound/ml/rule_learning/__init__.py +25 -0
truthound/ml/rule_learning/constraint_miner.py +443 -0
truthound/ml/rule_learning/pattern_learner.py +499 -0
truthound/ml/rule_learning/profile_learner.py +462 -0
truthound/multitenancy/__init__.py +326 -0
truthound/multitenancy/core.py +852 -0
truthound/multitenancy/integration.py +597 -0
truthound/multitenancy/isolation.py +630 -0
truthound/multitenancy/manager.py +770 -0
truthound/multitenancy/middleware.py +765 -0
truthound/multitenancy/quota.py +537 -0
truthound/multitenancy/resolvers.py +603 -0
truthound/multitenancy/storage.py +703 -0
truthound/observability/__init__.py +307 -0
truthound/observability/context.py +531 -0
truthound/observability/instrumentation.py +611 -0
truthound/observability/logging.py +887 -0
truthound/observability/metrics.py +1157 -0
truthound/observability/tracing/__init__.py +178 -0
truthound/observability/tracing/baggage.py +310 -0
truthound/observability/tracing/config.py +426 -0
truthound/observability/tracing/exporter.py +787 -0
truthound/observability/tracing/integration.py +1018 -0
truthound/observability/tracing/otel/__init__.py +146 -0
truthound/observability/tracing/otel/adapter.py +982 -0
truthound/observability/tracing/otel/bridge.py +1177 -0
truthound/observability/tracing/otel/compat.py +681 -0
truthound/observability/tracing/otel/config.py +691 -0
truthound/observability/tracing/otel/detection.py +327 -0
truthound/observability/tracing/otel/protocols.py +426 -0
truthound/observability/tracing/processor.py +561 -0
truthound/observability/tracing/propagator.py +757 -0
truthound/observability/tracing/provider.py +569 -0
truthound/observability/tracing/resource.py +515 -0
truthound/observability/tracing/sampler.py +487 -0
truthound/observability/tracing/span.py +676 -0
truthound/plugins/__init__.py +198 -0
truthound/plugins/base.py +599 -0
truthound/plugins/cli.py +680 -0
truthound/plugins/dependencies/__init__.py +42 -0
truthound/plugins/dependencies/graph.py +422 -0
truthound/plugins/dependencies/resolver.py +417 -0
truthound/plugins/discovery.py +379 -0
truthound/plugins/docs/__init__.py +46 -0
truthound/plugins/docs/extractor.py +444 -0
truthound/plugins/docs/renderer.py +499 -0
truthound/plugins/enterprise_manager.py +877 -0
truthound/plugins/examples/__init__.py +19 -0
truthound/plugins/examples/custom_validators.py +317 -0
truthound/plugins/examples/slack_notifier.py +312 -0
truthound/plugins/examples/xml_reporter.py +254 -0
truthound/plugins/hooks.py +558 -0
truthound/plugins/lifecycle/__init__.py +43 -0
truthound/plugins/lifecycle/hot_reload.py +402 -0
truthound/plugins/lifecycle/manager.py +371 -0
truthound/plugins/manager.py +736 -0
truthound/plugins/registry.py +338 -0
truthound/plugins/security/__init__.py +93 -0
truthound/plugins/security/exceptions.py +332 -0
truthound/plugins/security/policies.py +348 -0
truthound/plugins/security/protocols.py +643 -0
truthound/plugins/security/sandbox/__init__.py +45 -0
truthound/plugins/security/sandbox/context.py +158 -0
truthound/plugins/security/sandbox/engines/__init__.py +19 -0
truthound/plugins/security/sandbox/engines/container.py +379 -0
truthound/plugins/security/sandbox/engines/noop.py +144 -0
truthound/plugins/security/sandbox/engines/process.py +336 -0
truthound/plugins/security/sandbox/factory.py +211 -0
truthound/plugins/security/signing/__init__.py +57 -0
truthound/plugins/security/signing/service.py +330 -0
truthound/plugins/security/signing/trust_store.py +368 -0
truthound/plugins/security/signing/verifier.py +459 -0
truthound/plugins/versioning/__init__.py +41 -0
truthound/plugins/versioning/constraints.py +297 -0
truthound/plugins/versioning/resolver.py +329 -0
truthound/profiler/__init__.py +1729 -0
truthound/profiler/_lazy.py +452 -0
truthound/profiler/ab_testing/__init__.py +80 -0
truthound/profiler/ab_testing/analysis.py +449 -0
truthound/profiler/ab_testing/base.py +257 -0
truthound/profiler/ab_testing/experiment.py +395 -0
truthound/profiler/ab_testing/tracking.py +368 -0
truthound/profiler/auto_threshold.py +1170 -0
truthound/profiler/base.py +579 -0
truthound/profiler/cache_patterns.py +911 -0
truthound/profiler/caching.py +1303 -0
truthound/profiler/column_profiler.py +712 -0
truthound/profiler/comparison.py +1007 -0
truthound/profiler/custom_patterns.py +1170 -0
truthound/profiler/dashboard/__init__.py +50 -0
truthound/profiler/dashboard/app.py +476 -0
truthound/profiler/dashboard/components.py +457 -0
truthound/profiler/dashboard/config.py +72 -0
truthound/profiler/distributed/__init__.py +83 -0
truthound/profiler/distributed/base.py +281 -0
truthound/profiler/distributed/dask_backend.py +498 -0
truthound/profiler/distributed/local_backend.py +293 -0
truthound/profiler/distributed/profiler.py +304 -0
truthound/profiler/distributed/ray_backend.py +374 -0
truthound/profiler/distributed/spark_backend.py +375 -0
truthound/profiler/distributed.py +1366 -0
truthound/profiler/enterprise_sampling.py +1065 -0
truthound/profiler/errors.py +488 -0
truthound/profiler/evolution/__init__.py +91 -0
truthound/profiler/evolution/alerts.py +426 -0
truthound/profiler/evolution/changes.py +206 -0
truthound/profiler/evolution/compatibility.py +365 -0
truthound/profiler/evolution/detector.py +372 -0
truthound/profiler/evolution/protocols.py +121 -0
truthound/profiler/generators/__init__.py +48 -0
truthound/profiler/generators/base.py +384 -0
truthound/profiler/generators/ml_rules.py +375 -0
truthound/profiler/generators/pattern_rules.py +384 -0
truthound/profiler/generators/schema_rules.py +267 -0
truthound/profiler/generators/stats_rules.py +324 -0
truthound/profiler/generators/suite_generator.py +857 -0
truthound/profiler/i18n.py +1542 -0
truthound/profiler/incremental.py +554 -0
truthound/profiler/incremental_validation.py +1710 -0
truthound/profiler/integration/__init__.py +73 -0
truthound/profiler/integration/adapters.py +345 -0
truthound/profiler/integration/context.py +371 -0
truthound/profiler/integration/executor.py +527 -0
truthound/profiler/integration/naming.py +75 -0
truthound/profiler/integration/protocols.py +243 -0
truthound/profiler/memory.py +1185 -0
truthound/profiler/migration/__init__.py +60 -0
truthound/profiler/migration/base.py +345 -0
truthound/profiler/migration/manager.py +444 -0
truthound/profiler/migration/v1_0_to_v1_1.py +484 -0
truthound/profiler/ml/__init__.py +73 -0
truthound/profiler/ml/base.py +244 -0
truthound/profiler/ml/classifier.py +507 -0
truthound/profiler/ml/feature_extraction.py +604 -0
truthound/profiler/ml/pretrained.py +448 -0
truthound/profiler/ml_inference.py +1276 -0
truthound/profiler/native_patterns.py +815 -0
truthound/profiler/observability.py +1184 -0
truthound/profiler/process_timeout.py +1566 -0
truthound/profiler/progress.py +568 -0
truthound/profiler/progress_callbacks.py +1734 -0
truthound/profiler/quality.py +1345 -0
truthound/profiler/resilience.py +1180 -0
truthound/profiler/sampled_matcher.py +794 -0
truthound/profiler/sampling.py +1288 -0
truthound/profiler/scheduling/__init__.py +82 -0
truthound/profiler/scheduling/protocols.py +214 -0
truthound/profiler/scheduling/scheduler.py +474 -0
truthound/profiler/scheduling/storage.py +457 -0
truthound/profiler/scheduling/triggers.py +449 -0
truthound/profiler/schema.py +603 -0
truthound/profiler/streaming.py +685 -0
truthound/profiler/streaming_patterns.py +1354 -0
truthound/profiler/suite_cli.py +625 -0
truthound/profiler/suite_config.py +789 -0
truthound/profiler/suite_export.py +1268 -0
truthound/profiler/table_profiler.py +547 -0
truthound/profiler/timeout.py +565 -0
truthound/profiler/validation.py +1532 -0
truthound/profiler/visualization/__init__.py +118 -0
truthound/profiler/visualization/base.py +346 -0
truthound/profiler/visualization/generator.py +1259 -0
truthound/profiler/visualization/plotly_renderer.py +811 -0
truthound/profiler/visualization/renderers.py +669 -0
truthound/profiler/visualization/sections.py +540 -0
truthound/profiler/visualization.py +2122 -0
truthound/profiler/yaml_validation.py +1151 -0
truthound/py.typed +0 -0
truthound/ratelimit/__init__.py +248 -0
truthound/ratelimit/algorithms.py +1108 -0
truthound/ratelimit/core.py +573 -0
truthound/ratelimit/integration.py +532 -0
truthound/ratelimit/limiter.py +663 -0
truthound/ratelimit/middleware.py +700 -0
truthound/ratelimit/policy.py +792 -0
truthound/ratelimit/storage.py +763 -0
truthound/rbac/__init__.py +340 -0
truthound/rbac/core.py +976 -0
truthound/rbac/integration.py +760 -0
truthound/rbac/manager.py +1052 -0
truthound/rbac/middleware.py +842 -0
truthound/rbac/policy.py +954 -0
truthound/rbac/storage.py +878 -0
truthound/realtime/__init__.py +141 -0
truthound/realtime/adapters/__init__.py +43 -0
truthound/realtime/adapters/base.py +533 -0
truthound/realtime/adapters/kafka.py +487 -0
truthound/realtime/adapters/kinesis.py +479 -0
truthound/realtime/adapters/mock.py +243 -0
truthound/realtime/base.py +553 -0
truthound/realtime/factory.py +382 -0
truthound/realtime/incremental.py +660 -0
truthound/realtime/processing/__init__.py +67 -0
truthound/realtime/processing/exactly_once.py +575 -0
truthound/realtime/processing/state.py +547 -0
truthound/realtime/processing/windows.py +647 -0
truthound/realtime/protocols.py +569 -0
truthound/realtime/streaming.py +605 -0
truthound/realtime/testing/__init__.py +32 -0
truthound/realtime/testing/containers.py +615 -0
truthound/realtime/testing/fixtures.py +484 -0
truthound/report.py +280 -0
truthound/reporters/__init__.py +46 -0
truthound/reporters/_protocols.py +30 -0
truthound/reporters/base.py +324 -0
truthound/reporters/ci/__init__.py +66 -0
truthound/reporters/ci/azure.py +436 -0
truthound/reporters/ci/base.py +509 -0
truthound/reporters/ci/bitbucket.py +567 -0
truthound/reporters/ci/circleci.py +547 -0
truthound/reporters/ci/detection.py +364 -0
truthound/reporters/ci/factory.py +182 -0
truthound/reporters/ci/github.py +388 -0
truthound/reporters/ci/gitlab.py +471 -0
truthound/reporters/ci/jenkins.py +525 -0
truthound/reporters/console_reporter.py +299 -0
truthound/reporters/factory.py +211 -0
truthound/reporters/html_reporter.py +524 -0
truthound/reporters/json_reporter.py +256 -0
truthound/reporters/markdown_reporter.py +280 -0
truthound/reporters/sdk/__init__.py +174 -0
truthound/reporters/sdk/builder.py +558 -0
truthound/reporters/sdk/mixins.py +1150 -0
truthound/reporters/sdk/schema.py +1493 -0
truthound/reporters/sdk/templates.py +666 -0
truthound/reporters/sdk/testing.py +968 -0
truthound/scanners.py +170 -0
truthound/scheduling/__init__.py +122 -0
truthound/scheduling/cron.py +1136 -0
truthound/scheduling/presets.py +212 -0
truthound/schema.py +275 -0
truthound/secrets/__init__.py +173 -0
truthound/secrets/base.py +618 -0
truthound/secrets/cloud.py +682 -0
truthound/secrets/integration.py +507 -0
truthound/secrets/manager.py +633 -0
truthound/secrets/oidc/__init__.py +172 -0
truthound/secrets/oidc/base.py +902 -0
truthound/secrets/oidc/credential_provider.py +623 -0
truthound/secrets/oidc/exchangers.py +1001 -0
truthound/secrets/oidc/github/__init__.py +110 -0
truthound/secrets/oidc/github/claims.py +718 -0
truthound/secrets/oidc/github/enhanced_provider.py +693 -0
truthound/secrets/oidc/github/trust_policy.py +742 -0
truthound/secrets/oidc/github/verification.py +723 -0
truthound/secrets/oidc/github/workflow.py +691 -0
truthound/secrets/oidc/providers.py +825 -0
truthound/secrets/providers.py +506 -0
truthound/secrets/resolver.py +495 -0
truthound/stores/__init__.py +177 -0
truthound/stores/backends/__init__.py +18 -0
truthound/stores/backends/_protocols.py +340 -0
truthound/stores/backends/azure_blob.py +530 -0
truthound/stores/backends/concurrent_filesystem.py +915 -0
truthound/stores/backends/connection_pool.py +1365 -0
truthound/stores/backends/database.py +743 -0
truthound/stores/backends/filesystem.py +538 -0
truthound/stores/backends/gcs.py +399 -0
truthound/stores/backends/memory.py +354 -0
truthound/stores/backends/s3.py +434 -0
truthound/stores/backpressure/__init__.py +84 -0
truthound/stores/backpressure/base.py +375 -0
truthound/stores/backpressure/circuit_breaker.py +434 -0
truthound/stores/backpressure/monitor.py +376 -0
truthound/stores/backpressure/strategies.py +677 -0
truthound/stores/base.py +551 -0
truthound/stores/batching/__init__.py +65 -0
truthound/stores/batching/base.py +305 -0
truthound/stores/batching/buffer.py +370 -0
truthound/stores/batching/store.py +248 -0
truthound/stores/batching/writer.py +521 -0
truthound/stores/caching/__init__.py +60 -0
truthound/stores/caching/backends.py +684 -0
truthound/stores/caching/base.py +356 -0
truthound/stores/caching/store.py +305 -0
truthound/stores/compression/__init__.py +193 -0
truthound/stores/compression/adaptive.py +694 -0
truthound/stores/compression/base.py +514 -0
truthound/stores/compression/pipeline.py +868 -0
truthound/stores/compression/providers.py +672 -0
truthound/stores/compression/streaming.py +832 -0
truthound/stores/concurrency/__init__.py +81 -0
truthound/stores/concurrency/atomic.py +556 -0
truthound/stores/concurrency/index.py +775 -0
truthound/stores/concurrency/locks.py +576 -0
truthound/stores/concurrency/manager.py +482 -0
truthound/stores/encryption/__init__.py +297 -0
truthound/stores/encryption/base.py +952 -0
truthound/stores/encryption/keys.py +1191 -0
truthound/stores/encryption/pipeline.py +903 -0
truthound/stores/encryption/providers.py +953 -0
truthound/stores/encryption/streaming.py +950 -0
truthound/stores/expectations.py +227 -0
truthound/stores/factory.py +246 -0
truthound/stores/migration/__init__.py +75 -0
truthound/stores/migration/base.py +480 -0
truthound/stores/migration/manager.py +347 -0
truthound/stores/migration/registry.py +382 -0
truthound/stores/migration/store.py +559 -0
truthound/stores/observability/__init__.py +106 -0
truthound/stores/observability/audit.py +718 -0
truthound/stores/observability/config.py +270 -0
truthound/stores/observability/factory.py +208 -0
truthound/stores/observability/metrics.py +636 -0
truthound/stores/observability/protocols.py +410 -0
truthound/stores/observability/store.py +570 -0
truthound/stores/observability/tracing.py +784 -0
truthound/stores/replication/__init__.py +76 -0
truthound/stores/replication/base.py +260 -0
truthound/stores/replication/monitor.py +269 -0
truthound/stores/replication/store.py +439 -0
truthound/stores/replication/syncer.py +391 -0
truthound/stores/results.py +359 -0
truthound/stores/retention/__init__.py +77 -0
truthound/stores/retention/base.py +378 -0
truthound/stores/retention/policies.py +621 -0
truthound/stores/retention/scheduler.py +279 -0
truthound/stores/retention/store.py +526 -0
truthound/stores/streaming/__init__.py +138 -0
truthound/stores/streaming/base.py +801 -0
truthound/stores/streaming/database.py +984 -0
truthound/stores/streaming/filesystem.py +719 -0
truthound/stores/streaming/reader.py +629 -0
truthound/stores/streaming/s3.py +843 -0
truthound/stores/streaming/writer.py +790 -0
truthound/stores/tiering/__init__.py +108 -0
truthound/stores/tiering/base.py +462 -0
truthound/stores/tiering/manager.py +249 -0
truthound/stores/tiering/policies.py +692 -0
truthound/stores/tiering/store.py +526 -0
truthound/stores/versioning/__init__.py +56 -0
truthound/stores/versioning/base.py +376 -0
truthound/stores/versioning/store.py +660 -0
truthound/stores/versioning/strategies.py +353 -0
truthound/types.py +56 -0
truthound/validators/__init__.py +774 -0
truthound/validators/aggregate/__init__.py +27 -0
truthound/validators/aggregate/central.py +116 -0
truthound/validators/aggregate/extremes.py +116 -0
truthound/validators/aggregate/spread.py +118 -0
truthound/validators/aggregate/sum.py +64 -0
truthound/validators/aggregate/type.py +78 -0
truthound/validators/anomaly/__init__.py +93 -0
truthound/validators/anomaly/base.py +431 -0
truthound/validators/anomaly/ml_based.py +1190 -0
truthound/validators/anomaly/multivariate.py +647 -0
truthound/validators/anomaly/statistical.py +599 -0
truthound/validators/base.py +1089 -0
truthound/validators/business_rule/__init__.py +46 -0
truthound/validators/business_rule/base.py +147 -0
truthound/validators/business_rule/checksum.py +509 -0
truthound/validators/business_rule/financial.py +526 -0
truthound/validators/cache.py +733 -0
truthound/validators/completeness/__init__.py +39 -0
truthound/validators/completeness/conditional.py +73 -0
truthound/validators/completeness/default.py +98 -0
truthound/validators/completeness/empty.py +103 -0
truthound/validators/completeness/nan.py +337 -0
truthound/validators/completeness/null.py +152 -0
truthound/validators/cross_table/__init__.py +17 -0
truthound/validators/cross_table/aggregate.py +333 -0
truthound/validators/cross_table/row_count.py +122 -0
truthound/validators/datetime/__init__.py +29 -0
truthound/validators/datetime/format.py +78 -0
truthound/validators/datetime/freshness.py +269 -0
truthound/validators/datetime/order.py +73 -0
truthound/validators/datetime/parseable.py +185 -0
truthound/validators/datetime/range.py +202 -0
truthound/validators/datetime/timezone.py +69 -0
truthound/validators/distribution/__init__.py +49 -0
truthound/validators/distribution/distribution.py +128 -0
truthound/validators/distribution/monotonic.py +119 -0
truthound/validators/distribution/outlier.py +178 -0
truthound/validators/distribution/quantile.py +80 -0
truthound/validators/distribution/range.py +254 -0
truthound/validators/distribution/set.py +125 -0
truthound/validators/distribution/statistical.py +459 -0
truthound/validators/drift/__init__.py +79 -0
truthound/validators/drift/base.py +427 -0
truthound/validators/drift/multi_feature.py +401 -0
truthound/validators/drift/numeric.py +395 -0
truthound/validators/drift/psi.py +446 -0
truthound/validators/drift/statistical.py +510 -0
truthound/validators/enterprise.py +1658 -0
truthound/validators/geospatial/__init__.py +80 -0
truthound/validators/geospatial/base.py +97 -0
truthound/validators/geospatial/boundary.py +238 -0
truthound/validators/geospatial/coordinate.py +351 -0
truthound/validators/geospatial/distance.py +399 -0
truthound/validators/geospatial/polygon.py +665 -0
truthound/validators/i18n/__init__.py +308 -0
truthound/validators/i18n/bidi.py +571 -0
truthound/validators/i18n/catalogs.py +570 -0
truthound/validators/i18n/dialects.py +763 -0
truthound/validators/i18n/extended_catalogs.py +549 -0
truthound/validators/i18n/formatting.py +1434 -0
truthound/validators/i18n/loader.py +1020 -0
truthound/validators/i18n/messages.py +521 -0
truthound/validators/i18n/plural.py +683 -0
truthound/validators/i18n/protocols.py +855 -0
truthound/validators/i18n/tms.py +1162 -0
truthound/validators/localization/__init__.py +53 -0
truthound/validators/localization/base.py +122 -0
truthound/validators/localization/chinese.py +362 -0
truthound/validators/localization/japanese.py +275 -0
truthound/validators/localization/korean.py +524 -0
truthound/validators/memory/__init__.py +94 -0
truthound/validators/memory/approximate_knn.py +506 -0
truthound/validators/memory/base.py +547 -0
truthound/validators/memory/sgd_online.py +719 -0
truthound/validators/memory/streaming_ecdf.py +753 -0
truthound/validators/ml_feature/__init__.py +54 -0
truthound/validators/ml_feature/base.py +249 -0
truthound/validators/ml_feature/correlation.py +299 -0
truthound/validators/ml_feature/leakage.py +344 -0
truthound/validators/ml_feature/null_impact.py +270 -0
truthound/validators/ml_feature/scale.py +264 -0
truthound/validators/multi_column/__init__.py +89 -0
truthound/validators/multi_column/arithmetic.py +284 -0
truthound/validators/multi_column/base.py +231 -0
truthound/validators/multi_column/comparison.py +273 -0
truthound/validators/multi_column/consistency.py +312 -0
truthound/validators/multi_column/statistical.py +299 -0
truthound/validators/optimization/__init__.py +164 -0
truthound/validators/optimization/aggregation.py +563 -0
truthound/validators/optimization/covariance.py +556 -0
truthound/validators/optimization/geo.py +626 -0
truthound/validators/optimization/graph.py +587 -0
truthound/validators/optimization/orchestrator.py +970 -0
truthound/validators/optimization/profiling.py +1312 -0
truthound/validators/privacy/__init__.py +223 -0
truthound/validators/privacy/base.py +635 -0
truthound/validators/privacy/ccpa.py +670 -0
truthound/validators/privacy/gdpr.py +728 -0
truthound/validators/privacy/global_patterns.py +604 -0
truthound/validators/privacy/plugins.py +867 -0
truthound/validators/profiling/__init__.py +52 -0
truthound/validators/profiling/base.py +175 -0
truthound/validators/profiling/cardinality.py +312 -0
truthound/validators/profiling/entropy.py +391 -0
truthound/validators/profiling/frequency.py +455 -0
truthound/validators/pushdown_support.py +660 -0
truthound/validators/query/__init__.py +91 -0
truthound/validators/query/aggregate.py +346 -0
truthound/validators/query/base.py +246 -0
truthound/validators/query/column.py +249 -0
truthound/validators/query/expression.py +274 -0
truthound/validators/query/result.py +323 -0
truthound/validators/query/row_count.py +264 -0
truthound/validators/referential/__init__.py +80 -0
truthound/validators/referential/base.py +395 -0
truthound/validators/referential/cascade.py +391 -0
truthound/validators/referential/circular.py +563 -0
truthound/validators/referential/foreign_key.py +624 -0
truthound/validators/referential/orphan.py +485 -0
truthound/validators/registry.py +112 -0
truthound/validators/schema/__init__.py +41 -0
truthound/validators/schema/column_count.py +142 -0
truthound/validators/schema/column_exists.py +80 -0
truthound/validators/schema/column_order.py +82 -0
truthound/validators/schema/column_pair.py +85 -0
truthound/validators/schema/column_pair_set.py +195 -0
truthound/validators/schema/column_type.py +94 -0
truthound/validators/schema/multi_column.py +53 -0
truthound/validators/schema/multi_column_aggregate.py +175 -0
truthound/validators/schema/referential.py +274 -0
truthound/validators/schema/table_schema.py +91 -0
truthound/validators/schema_validator.py +219 -0
truthound/validators/sdk/__init__.py +250 -0
truthound/validators/sdk/builder.py +680 -0
truthound/validators/sdk/decorators.py +474 -0
truthound/validators/sdk/enterprise/__init__.py +211 -0
truthound/validators/sdk/enterprise/docs.py +725 -0
truthound/validators/sdk/enterprise/fuzzing.py +659 -0
truthound/validators/sdk/enterprise/licensing.py +709 -0
truthound/validators/sdk/enterprise/manager.py +543 -0
truthound/validators/sdk/enterprise/resources.py +628 -0
truthound/validators/sdk/enterprise/sandbox.py +766 -0
truthound/validators/sdk/enterprise/signing.py +603 -0
truthound/validators/sdk/enterprise/templates.py +865 -0
truthound/validators/sdk/enterprise/versioning.py +659 -0
truthound/validators/sdk/templates.py +757 -0
truthound/validators/sdk/testing.py +807 -0
truthound/validators/security/__init__.py +181 -0
truthound/validators/security/redos/__init__.py +182 -0
truthound/validators/security/redos/core.py +861 -0
truthound/validators/security/redos/cpu_monitor.py +593 -0
truthound/validators/security/redos/cve_database.py +791 -0
truthound/validators/security/redos/ml/__init__.py +155 -0
truthound/validators/security/redos/ml/base.py +785 -0
truthound/validators/security/redos/ml/datasets.py +618 -0
truthound/validators/security/redos/ml/features.py +359 -0
truthound/validators/security/redos/ml/models.py +1000 -0
truthound/validators/security/redos/ml/predictor.py +507 -0
truthound/validators/security/redos/ml/storage.py +632 -0
truthound/validators/security/redos/ml/training.py +571 -0
truthound/validators/security/redos/ml_analyzer.py +937 -0
truthound/validators/security/redos/optimizer.py +674 -0
truthound/validators/security/redos/profiler.py +682 -0
truthound/validators/security/redos/re2_engine.py +709 -0
truthound/validators/security/redos.py +886 -0
truthound/validators/security/sql_security.py +1247 -0
truthound/validators/streaming/__init__.py +126 -0
truthound/validators/streaming/base.py +292 -0
truthound/validators/streaming/completeness.py +210 -0
truthound/validators/streaming/mixin.py +575 -0
truthound/validators/streaming/range.py +308 -0
truthound/validators/streaming/sources.py +846 -0
truthound/validators/string/__init__.py +57 -0
truthound/validators/string/casing.py +158 -0
truthound/validators/string/charset.py +96 -0
truthound/validators/string/format.py +501 -0
truthound/validators/string/json.py +77 -0
truthound/validators/string/json_schema.py +184 -0
truthound/validators/string/length.py +104 -0
truthound/validators/string/like_pattern.py +237 -0
truthound/validators/string/regex.py +202 -0
truthound/validators/string/regex_extended.py +435 -0
truthound/validators/table/__init__.py +88 -0
truthound/validators/table/base.py +78 -0
truthound/validators/table/column_count.py +198 -0
truthound/validators/table/freshness.py +362 -0
truthound/validators/table/row_count.py +251 -0
truthound/validators/table/schema.py +333 -0
truthound/validators/table/size.py +285 -0
truthound/validators/timeout/__init__.py +102 -0
truthound/validators/timeout/advanced/__init__.py +247 -0
truthound/validators/timeout/advanced/circuit_breaker.py +675 -0
truthound/validators/timeout/advanced/prediction.py +773 -0
truthound/validators/timeout/advanced/priority.py +618 -0
truthound/validators/timeout/advanced/redis_backend.py +770 -0
truthound/validators/timeout/advanced/retry.py +721 -0
truthound/validators/timeout/advanced/sampling.py +788 -0
truthound/validators/timeout/advanced/sla.py +661 -0
truthound/validators/timeout/advanced/telemetry.py +804 -0
truthound/validators/timeout/cascade.py +477 -0
truthound/validators/timeout/deadline.py +657 -0
truthound/validators/timeout/degradation.py +525 -0
truthound/validators/timeout/distributed.py +597 -0
truthound/validators/timeseries/__init__.py +89 -0
truthound/validators/timeseries/base.py +326 -0
truthound/validators/timeseries/completeness.py +617 -0
truthound/validators/timeseries/gap.py +485 -0
truthound/validators/timeseries/monotonic.py +310 -0
truthound/validators/timeseries/seasonality.py +422 -0
truthound/validators/timeseries/trend.py +510 -0
truthound/validators/uniqueness/__init__.py +59 -0
truthound/validators/uniqueness/approximate.py +475 -0
truthound/validators/uniqueness/distinct_values.py +253 -0
truthound/validators/uniqueness/duplicate.py +118 -0
truthound/validators/uniqueness/primary_key.py +140 -0
truthound/validators/uniqueness/unique.py +191 -0
truthound/validators/uniqueness/within_record.py +599 -0
truthound/validators/utils.py +756 -0
truthound-1.0.8.dist-info/METADATA +474 -0
truthound-1.0.8.dist-info/RECORD +877 -0
truthound-1.0.8.dist-info/WHEEL +4 -0
truthound-1.0.8.dist-info/entry_points.txt +2 -0
truthound-1.0.8.dist-info/licenses/LICENSE +190 -0

truthound/profiler/ml_inference.py ADDED Viewed

@@ -0,0 +1,1276 @@
+"""ML-based type inference beyond pattern matching.
+This module provides machine learning based type inference that considers:
+- Column context (name, position, neighboring columns)
+- Value distribution patterns
+- Semantic relationships
+- Historical learning from user feedback
+Key features:
+- Pluggable model architecture
+- Feature extraction pipeline
+- Online learning support
+- Confidence calibration
+Example:
+    from truthound.profiler.ml_inference import (
+        MLTypeInferrer,
+        ContextFeatureExtractor,
+        create_inference_model,
+    )
+    # Create inferrer with default model
+    inferrer = MLTypeInferrer()
+    # Infer type with context
+    result = inferrer.infer(column, context={
+        "column_name": "email_address",
+        "table_name": "users",
+        "sample_values": ["a@b.com", "c@d.org"],
+    })
+    print(f"Type: {result.inferred_type}, Confidence: {result.confidence:.2%}")
+"""
+from __future__ import annotations
+import hashlib
+import json
+import logging
+import math
+import os
+import pickle
+import re
+import threading
+from abc import ABC, abstractmethod
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Type, Union
+import polars as pl
+from truthound.profiler.base import DataType, ColumnProfile
+logger = logging.getLogger(__name__)
+# =============================================================================
+# Feature Types
+# =============================================================================
+class FeatureType(str, Enum):
+    """Types of features for ML inference.
+    Each member tags a feature with the kind of evidence it was derived from.
+    The enum also subclasses ``str``, so members compare equal to their
+    string values.
+    """
+    # Used by NameFeatureExtractor (features derived from the column name).
+    NAME_BASED = "name_based"
+    # Used by ValueFeatureExtractor (features derived from sampled values).
+    VALUE_BASED = "value_based"
+    # Used by StatisticalFeatureExtractor (cardinality / frequency features).
+    STATISTICAL = "statistical"
+    # Used by ContextFeatureExtractor (table name, position, neighbors).
+    CONTEXTUAL = "contextual"
+    # NOTE(review): not referenced by any extractor visible in this part of the file.
+    PATTERN_BASED = "pattern_based"
+@dataclass
+class Feature:
+    """Single feature for ML model.
+    A named numeric value together with the category of extractor that
+    produced it.
+    """
+    # Feature identifier, e.g. "null_ratio" or "name_match_email".
+    name: str
+    # Numeric feature value; the extractors in this module mostly emit values in [0, 1].
+    value: float
+    # Category of the extractor that produced this feature.
+    feature_type: FeatureType
+    # Relative weight, defaulting to 1.0.
+    # NOTE(review): not read by any code visible in this part of the file.
+    importance: float = 1.0
+    # Free-form extra information; a fresh dict per instance.
+    metadata: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class FeatureVector:
+    """Ordered collection of the features extracted for one column."""
+    column_name: str
+    features: list[Feature]
+    raw_values: dict[str, Any] = field(default_factory=dict)
+    def to_array(self) -> list[float]:
+        """Return the feature values as a flat list, in extraction order."""
+        values: list[float] = []
+        for feature in self.features:
+            values.append(feature.value)
+        return values
+    def to_dict(self) -> dict[str, float]:
+        """Return a name -> value mapping.
+        When several features share a name, the last one wins.
+        """
+        return dict((feature.name, feature.value) for feature in self.features)
+    def get_feature(self, name: str) -> Feature | None:
+        """Return the first feature called ``name``, or ``None`` if there is none."""
+        matches = (feature for feature in self.features if feature.name == name)
+        return next(matches, None)
+# =============================================================================
+# Inference Result
+# =============================================================================
+@dataclass
+class InferenceResult:
+    """Outcome of inferring the semantic type of a single column."""
+    column_name: str
+    inferred_type: DataType
+    confidence: float
+    alternatives: list[tuple[DataType, float]] = field(default_factory=list)
+    reasoning: list[str] = field(default_factory=list)
+    features_used: list[str] = field(default_factory=list)
+    model_version: str = "1.0"
+    inference_time_ms: float = 0.0
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict view of the result.
+        Types are written as their ``.value``; each alternative becomes a
+        ``{"type", "confidence"}`` dict in the original order.
+        """
+        ranked_alternatives = []
+        for alt_type, alt_confidence in self.alternatives:
+            ranked_alternatives.append(
+                {"type": alt_type.value, "confidence": alt_confidence}
+            )
+        return dict(
+            column_name=self.column_name,
+            inferred_type=self.inferred_type.value,
+            confidence=self.confidence,
+            alternatives=ranked_alternatives,
+            reasoning=self.reasoning,
+            features_used=self.features_used,
+            model_version=self.model_version,
+            inference_time_ms=self.inference_time_ms,
+        )
+# =============================================================================
+# Feature Extractor Protocol
+# =============================================================================
+class FeatureExtractor(ABC):
+    """Abstract base for feature extractors.
+    Concrete extractors override ``name`` (the key FeatureExtractorRegistry
+    stores them under) and ``feature_type``, and implement ``extract``.
+    """
+    # Registry key; subclasses are expected to override it.
+    name: str = "base"
+    # Category stamped on the features this extractor produces.
+    feature_type: FeatureType = FeatureType.VALUE_BASED
+    @abstractmethod
+    def extract(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> list[Feature]:
+        """Extract features from column.
+        Args:
+            column: Column data
+            context: Additional context (column name, table info, etc.)
+        Returns:
+            List of extracted features
+        """
+        pass
+class NameFeatureExtractor(FeatureExtractor):
+    """Extract features from column names.
+    Scores the lower-cased column name against per-type keyword lists using
+    plain substring matching, then adds a few generic name-shape features.
+    """
+    name = "name_features"
+    feature_type = FeatureType.NAME_BASED
+    # Keywords associated with each type
+    TYPE_KEYWORDS: dict[DataType, list[str]] = {
+        DataType.EMAIL: ["email", "mail", "e_mail", "correo"],
+        DataType.PHONE: ["phone", "tel", "mobile", "cell", "fax", "telephone"],
+        DataType.URL: ["url", "link", "href", "website", "uri", "endpoint"],
+        DataType.UUID: ["uuid", "guid", "id", "identifier", "uid"],
+        DataType.DATE: ["date", "day", "birth", "created", "updated", "modified"],
+        DataType.DATETIME: ["datetime", "timestamp", "time", "at", "when"],
+        DataType.INTEGER: ["count", "num", "qty", "quantity", "amount", "total", "id"],
+        DataType.FLOAT: ["price", "rate", "ratio", "percent", "score", "value"],
+        DataType.BOOLEAN: ["is_", "has_", "flag", "active", "enabled", "valid"],
+        DataType.CURRENCY: ["price", "cost", "amount", "fee", "payment", "salary"],
+        DataType.PERCENTAGE: ["percent", "pct", "ratio", "rate"],
+        DataType.KOREAN_PHONE: ["phone", "hp", "tel", "mobile", "연락처", "전화"],
+        DataType.KOREAN_RRN: ["rrn", "resident", "주민", "jumin"],
+        DataType.KOREAN_BUSINESS_NUMBER: ["business", "사업자", "brn"],
+        DataType.CATEGORICAL: ["type", "status", "category", "class", "kind", "level"],
+        DataType.IDENTIFIER: ["id", "key", "code", "no", "number"],
+    }
+    def extract(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> list[Feature]:
+        """Build name-based features for one column.
+        Args:
+            column: Column data; only its ``name`` is used, as a fallback.
+            context: May carry ``"column_name"``; when the key is missing or
+                its value is ``None`` the series name is used instead.
+        Returns:
+            One ``name_match_<type>`` feature per data type with at least one
+            keyword hit (value = hits / number of keywords for that type),
+            followed by ``name_length``, ``name_has_underscore`` and
+            ``name_has_number``.
+        """
+        raw_name = context.get("column_name")
+        if raw_name is None:
+            # Previously a None value crashed on .lower(); fall back to the series name.
+            raw_name = column.name or ""
+        col_name = raw_name.lower()
+        features: list[Feature] = []
+        # Check each type's keywords
+        for dtype, keywords in self.TYPE_KEYWORDS.items():
+            # NOTE: the earlier per-token fallback (worth 0.5) could never fire,
+            # because every token is itself a substring of col_name, so only
+            # whole-name substring hits ever contributed to the score.
+            hits = sum(1 for keyword in keywords if keyword in col_name)
+            if hits:
+                features.append(Feature(
+                    name=f"name_match_{dtype.value}",
+                    value=min(1.0, hits / len(keywords)),
+                    feature_type=self.feature_type,
+                ))
+        # Add general name features
+        features.append(Feature(
+            name="name_length",
+            value=min(1.0, len(col_name) / 50),  # saturates at 50 characters
+            feature_type=self.feature_type,
+        ))
+        features.append(Feature(
+            name="name_has_underscore",
+            value=1.0 if "_" in col_name else 0.0,
+            feature_type=self.feature_type,
+        ))
+        features.append(Feature(
+            name="name_has_number",
+            value=1.0 if any(c.isdigit() for c in col_name) else 0.0,
+            feature_type=self.feature_type,
+        ))
+        return features
+class ValueFeatureExtractor(FeatureExtractor):
+    """Extract features from actual values.
+    Works on the first (at most) 1000 non-null values and dispatches on the
+    column dtype: Utf8 columns get length, character and pattern features;
+    Int32/Int64/Float32/Float64 columns get range and shape features; Boolean
+    columns get a single marker feature. ``null_ratio`` and ``unique_ratio``
+    are appended whenever the column has at least one non-null value.
+    """
+    name = "value_features"
+    feature_type = FeatureType.VALUE_BASED
+    # Maximum number of non-null values analysed with polars expressions.
+    _SAMPLE_SIZE = 1000
+    # Maximum number of values inspected one by one in Python.
+    _PYTHON_SAMPLE_SIZE = 100
+    # Compiled once instead of being re-specified for every inspected value.
+    _EMAIL_RE = re.compile(r"^[^@]+@[^@]+\.[^@]+$")
+    _UUID_RE = re.compile(
+        r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"
+    )
+    # (feature name, character whose presence is measured), in output order.
+    _MARKER_CHARS = (
+        ("has_at_sign", "@"),
+        ("has_dot", "."),
+        ("has_slash", "/"),
+        ("has_dash", "-"),
+        ("has_colon", ":"),
+    )
+    def _feature(self, name: str, value: float) -> Feature:
+        """Create a feature tagged with this extractor's feature type."""
+        return Feature(name=name, value=value, feature_type=self.feature_type)
+    def extract(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> list[Feature]:
+        """Extract value-based features from ``column``.
+        Args:
+            column: Column data.
+            context: Unused by this extractor.
+        Returns:
+            A single ``all_null`` feature when there are no non-null values;
+            otherwise the dtype-specific features followed by ``null_ratio``
+            and ``unique_ratio``.
+        """
+        # Sample values for analysis
+        sample = column.drop_nulls().head(self._SAMPLE_SIZE)
+        if len(sample) == 0:
+            return [self._feature("all_null", 1.0)]
+        features: list[Feature] = []
+        if column.dtype == pl.Utf8:
+            features.extend(self._extract_string_features(sample))
+        elif column.dtype in [pl.Int32, pl.Int64, pl.Float32, pl.Float64]:
+            # NOTE(review): other numeric dtypes (e.g. Int8/Int16/UInt*) get no
+            # numeric features here - confirm this is intended.
+            features.extend(self._extract_numeric_features(sample))
+        elif column.dtype == pl.Boolean:
+            features.append(self._feature("is_boolean", 1.0))
+        # General features; the column is non-empty here because the sample is.
+        total = len(column)
+        features.append(self._feature("null_ratio", column.null_count() / total))
+        features.append(self._feature("unique_ratio", column.n_unique() / total))
+        return features
+    def _extract_string_features(self, sample: pl.Series) -> list[Feature]:
+        """Extract features from string values.
+        Args:
+            sample: Non-empty series of non-null strings.
+        Returns:
+            Length statistics over the whole sample, then character-presence
+            ratios, the average digit ratio and email/UUID pattern ratios over
+            the first 100 values.
+        """
+        # Length statistics
+        lengths = sample.str.len_chars()
+        avg_len = lengths.mean() or 0
+        std_len = lengths.std() or 0  # std() is None for a single value
+        features = [
+            self._feature("avg_string_length", min(1.0, avg_len / 100)),
+            # Despite the name this is std / mean, capped at 1.
+            self._feature(
+                "length_variance",
+                min(1.0, std_len / avg_len) if avg_len > 0 else 0,
+            ),
+        ]
+        # Limit for performance: slice in polars before converting to Python.
+        texts = [str(v) for v in sample.head(self._PYTHON_SAMPLE_SIZE).to_list()]
+        n_texts = len(texts)
+        # Character type ratios
+        for feature_name, char in self._MARKER_CHARS:
+            hits = sum(1 for text in texts if char in text)
+            features.append(self._feature(feature_name, hits / n_texts))
+        # Digit ratio, averaged over non-empty strings only
+        digit_ratios = [
+            sum(c.isdigit() for c in text) / len(text) for text in texts if text
+        ]
+        avg_digit_ratio = sum(digit_ratios) / len(digit_ratios) if digit_ratios else 0
+        features.append(self._feature("digit_ratio", avg_digit_ratio))
+        # Check for common patterns
+        email_hits = sum(1 for text in texts if self._EMAIL_RE.match(text))
+        uuid_hits = sum(1 for text in texts if self._UUID_RE.match(text))
+        features.append(self._feature("email_pattern_ratio", email_hits / n_texts))
+        features.append(self._feature("uuid_pattern_ratio", uuid_hits / n_texts))
+        return features
+    def _extract_numeric_features(self, sample: pl.Series) -> list[Feature]:
+        """Extract features from numeric values.
+        Args:
+            sample: Non-empty series of non-null numbers.
+        Returns:
+            ``numeric_range_log``; ``is_sequential`` for Int32/Int64 samples;
+            ``in_0_1_range`` and ``in_0_100_range``; and ``is_currency_like``
+            for Float32/Float64 samples that yield decimal information.
+        """
+        # Range features
+        min_val = sample.min() or 0
+        max_val = sample.max() or 0
+        range_val = max_val - min_val
+        features = [
+            self._feature("numeric_range_log", math.log10(range_val + 1) / 10),  # Normalize
+        ]
+        # Check if values look like IDs (sequential integers)
+        if sample.dtype in [pl.Int32, pl.Int64]:
+            diffs = sample.sort().diff().drop_nulls()
+            # Share of adjacent sorted values that differ by exactly 1.
+            is_sequential = (diffs == 1).mean() if len(diffs) > 0 else 0
+            features.append(self._feature("is_sequential", is_sequential or 0))
+        # Check for percentage-like values (0-100 or 0-1)
+        in_0_1 = ((sample >= 0) & (sample <= 1)).mean()
+        in_0_100 = ((sample >= 0) & (sample <= 100)).mean()
+        features.append(self._feature("in_0_1_range", in_0_1 or 0))
+        features.append(self._feature("in_0_100_range", in_0_100 or 0))
+        # Check for currency-like (2 decimal places)
+        if sample.dtype in [pl.Float32, pl.Float64]:
+            decimal_places = []
+            for v in sample.head(self._PYTHON_SAMPLE_SIZE).to_list():
+                if v is not None:
+                    # Fixed-point rendering with trailing zeros removed; values
+                    # such as nan/inf contain no "." and are skipped.
+                    s = f"{v:.10f}".rstrip("0")
+                    if "." in s:
+                        decimal_places.append(len(s.split(".")[1]))
+            if decimal_places:
+                avg_decimals = sum(decimal_places) / len(decimal_places)
+                is_currency_like = 1.0 if 1.5 <= avg_decimals <= 2.5 else 0.0
+                features.append(self._feature("is_currency_like", is_currency_like))
+        return features
+class StatisticalFeatureExtractor(FeatureExtractor):
+    """Derive cardinality and value-frequency features from a column."""
+    name = "statistical_features"
+    feature_type = FeatureType.STATISTICAL
+    def extract(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> list[Feature]:
+        """Compute distribution features over the non-null values.
+        Args:
+            column: Column data.
+            context: Unused by this extractor.
+        Returns:
+            An empty list when every value is null; otherwise ``cardinality``,
+            ``is_categorical``, ``is_identifier`` and, when value counts are
+            available, ``max_frequency`` and ``normalized_entropy``.
+        """
+        values = column.drop_nulls()
+        total = len(values)
+        if total == 0:
+            return []
+        distinct = values.n_unique()
+        ratio = distinct / total
+        # Few distinct values that also repeat heavily -> categorical.
+        looks_categorical = distinct < 20 and ratio < 0.05
+        # Almost every value distinct -> identifier.
+        looks_identifier = ratio > 0.95
+        named_values = [
+            ("cardinality", ratio),
+            ("is_categorical", 1.0 if looks_categorical else 0.0),
+            ("is_identifier", 1.0 if looks_identifier else 0.0),
+        ]
+        frequency_table = values.value_counts()
+        if len(frequency_table) > 0:
+            counts = frequency_table.get_column("count").to_list()
+            named_values.append(("max_frequency", max(counts) / total))
+            # Shannon entropy of the value distribution, scaled by log2(distinct).
+            probs = [c / total for c in counts]
+            entropy = -sum(p * math.log2(p) for p in probs if p > 0)
+            max_entropy = math.log2(distinct) if distinct > 1 else 1
+            normalized = entropy / max_entropy if max_entropy > 0 else 0
+            named_values.append(("normalized_entropy", normalized))
+        return [
+            Feature(name=label, value=amount, feature_type=self.feature_type)
+            for label, amount in named_values
+        ]
+class ContextFeatureExtractor(FeatureExtractor):
+    """Extract features from column context.
+    Uses only the ``context`` mapping (table name, column position and
+    neighboring column names); the column data itself is not inspected.
+    """
+    name = "context_features"
+    feature_type = FeatureType.CONTEXTUAL
+    # Table-name keywords, in the order their features are emitted.
+    _TABLE_HINTS = (
+        ("table_is_user_related", ("user", "customer", "member")),
+        ("table_is_transaction_related", ("order", "transaction", "payment")),
+    )
+    # Substrings looked for in neighboring column names.
+    _NEIGHBOR_KEYWORDS = ("email", "name")
+    def extract(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> list[Feature]:
+        """Build contextual features for one column.
+        Args:
+            column: Column data (unused).
+            context: Optional keys ``table_name``, ``column_index`` (default
+                0), ``total_columns`` (default 1) and ``neighbor_columns``.
+                A ``None`` ``table_name`` or ``neighbor_columns`` is treated
+                as absent.
+        Returns:
+            Table hint features, ``column_position``, ``is_first_column``
+            when the index is 0, and at most one ``neighbor_has_email`` and
+            one ``neighbor_has_name`` feature.
+        """
+        features: list[Feature] = []
+        # Table-level context
+        table_name = (context.get("table_name") or "").lower()
+        if table_name:
+            for feature_name, keywords in self._TABLE_HINTS:
+                if any(kw in table_name for kw in keywords):
+                    features.append(Feature(
+                        name=feature_name,
+                        value=1.0,
+                        feature_type=self.feature_type,
+                    ))
+        # Column position
+        col_index = context.get("column_index", 0)
+        total_cols = context.get("total_columns", 1)
+        position_ratio = col_index / total_cols if total_cols > 0 else 0
+        features.append(Feature(
+            name="column_position",
+            value=position_ratio,
+            feature_type=self.feature_type,
+        ))
+        # First column is often ID
+        # NOTE(review): a missing "column_index" defaults to 0 and is therefore
+        # reported as the first column - confirm that is intended.
+        if col_index == 0:
+            features.append(Feature(
+                name="is_first_column",
+                value=1.0,
+                feature_type=self.feature_type,
+            ))
+        # Neighboring columns: emit each hint once, in first-occurrence order.
+        # Emitting one feature per matching neighbor produced duplicate names
+        # and a feature list whose length depended on the neighbor count.
+        emitted: set[str] = set()
+        for neighbor in context.get("neighbor_columns") or []:
+            lowered = neighbor.lower()
+            for keyword in self._NEIGHBOR_KEYWORDS:
+                feature_name = f"neighbor_has_{keyword}"
+                if keyword in lowered and feature_name not in emitted:
+                    emitted.add(feature_name)
+                    features.append(Feature(
+                        name=feature_name,
+                        value=1.0,
+                        feature_type=self.feature_type,
+                    ))
+        return features
+# =============================================================================
+# Feature Extractor Registry
+# =============================================================================
+class FeatureExtractorRegistry:
+    """Name-indexed collection of feature extractors."""
+    def __init__(self) -> None:
+        # Insertion-ordered mapping: extractor name -> extractor instance.
+        self._extractors: dict[str, FeatureExtractor] = {}
+    def register(self, extractor: FeatureExtractor) -> None:
+        """Store ``extractor`` under its ``name``, replacing any previous entry."""
+        self._extractors[extractor.name] = extractor
+    def get(self, name: str) -> FeatureExtractor:
+        """Return the extractor registered as ``name``.
+        Raises:
+            KeyError: If no extractor with that name is registered.
+        """
+        if name in self._extractors:
+            return self._extractors[name]
+        raise KeyError(f"Unknown extractor: {name}")
+    def list_extractors(self) -> list[str]:
+        """Return the registered extractor names in registration order."""
+        return [*self._extractors]
+    def extract_all(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> FeatureVector:
+        """Run every registered extractor and concatenate the results.
+        An extractor that raises is logged as a warning and skipped, so one
+        failure does not prevent the remaining extractors from running.
+        """
+        collected = []
+        for extractor in self._extractors.values():
+            try:
+                collected += extractor.extract(column, context)
+            except Exception as e:
+                logger.warning(f"Extractor {extractor.name} failed: {e}")
+        vector_name = context.get("column_name", column.name or "")
+        return FeatureVector(column_name=vector_name, features=collected)
+# Global registry with default extractors.
+# Populated at import time. Registration order matters: extract_all() runs the
+# extractors in this order and concatenates their features accordingly.
+feature_extractor_registry = FeatureExtractorRegistry()
+feature_extractor_registry.register(NameFeatureExtractor())
+feature_extractor_registry.register(ValueFeatureExtractor())
+feature_extractor_registry.register(StatisticalFeatureExtractor())
+feature_extractor_registry.register(ContextFeatureExtractor())
+# =============================================================================
+# ML Model Protocol
+# =============================================================================
+class InferenceModel(ABC):
+    """Abstract base for inference models.
+    Subclasses must implement ``predict`` and ``train``. ``save`` and ``load``
+    are optional hooks; the default implementations do nothing.
+    """
+    # Model identifier; subclasses are expected to override it.
+    name: str = "base"
+    # Model version string.
+    version: str = "1.0"
+    @abstractmethod
+    def predict(
+        self,
+        features: FeatureVector,
+    ) -> list[tuple[DataType, float]]:
+        """Predict type probabilities.
+        Args:
+            features: Extracted features
+        Returns:
+            List of (DataType, probability) sorted by probability
+        """
+        pass
+    @abstractmethod
+    def train(
+        self,
+        training_data: list[tuple[FeatureVector, DataType]],
+    ) -> None:
+        """Train/update the model.
+        Args:
+            training_data: List of (features, true_type) pairs
+        """
+        pass
+    def save(self, path: str | Path) -> None:
+        """Save model to file.
+        The base implementation is a no-op: nothing is written to ``path``.
+        """
+        pass
+    def load(self, path: str | Path) -> None:
+        """Load model from file.
+        The base implementation is a no-op: ``path`` is not read.
+        """
+        pass
+class RuleBasedModel(InferenceModel):
+    """Rule-based inference model.
+    Uses weighted rules derived from feature values to infer types.
+    Good baseline that doesn't require training data.
+    """
+    name = "rule_based"
+    version = "1.0"
+    def __init__(self) -> None:
+        # Define rules as (feature_name, operator, threshold, type, weight)
+        self.rules: list[tuple[str, str, float, DataType, float]] = [
+            # Email rules
+            ("email_pattern_ratio", ">=", 0.8, DataType.EMAIL, 0.9),
+            ("has_at_sign", ">=", 0.9, DataType.EMAIL, 0.7),
+            ("name_match_email", ">=", 0.5, DataType.EMAIL, 0.5),
+            # UUID rules
+            ("uuid_pattern_ratio", ">=", 0.8, DataType.UUID, 0.95),
+            ("name_match_uuid", ">=", 0.5, DataType.UUID, 0.6),
+            # Identifier rules
+            ("is_identifier", ">=", 0.9, DataType.IDENTIFIER, 0.7),
+            ("is_first_column", ">=", 0.5, DataType.IDENTIFIER, 0.3),
+            ("name_match_identifier", ">=", 0.5, DataType.IDENTIFIER, 0.4),
+            # Categorical rules
+            ("is_categorical", ">=", 0.8, DataType.CATEGORICAL, 0.8),
+            ("name_match_categorical", ">=", 0.5, DataType.CATEGORICAL, 0.4),
+            # Date/DateTime rules
+            ("name_match_date", ">=", 0.5, DataType.DATE, 0.5),
+            ("name_match_datetime", ">=", 0.5, DataType.DATETIME, 0.5),
+            # Numeric rules
+            ("is_currency_like", ">=", 0.8, DataType.CURRENCY, 0.7),
+            ("in_0_100_range", ">=", 0.9, DataType.PERCENTAGE, 0.5),
+            ("in_0_1_range", ">=", 0.95, DataType.PERCENTAGE, 0.6),
+            # Phone rules
+            ("name_match_phone", ">=", 0.5, DataType.PHONE, 0.5),
+            ("name_match_korean_phone", ">=", 0.5, DataType.KOREAN_PHONE, 0.6),
+            # Boolean
+            ("is_boolean", ">=", 0.9, DataType.BOOLEAN, 0.95),
+        ]
+    def predict(
+        self,
+        features: FeatureVector,
+    ) -> list[tuple[DataType, float]]:
+        """Apply rules to predict type."""
+        type_scores: dict[DataType, float] = defaultdict(float)
+        feature_dict = features.to_dict()
+        for feature_name, operator, threshold, dtype, weight in self.rules:
+            value = feature_dict.get(feature_name, 0.0)
+            match = False
+            if operator == ">=":
+                match = value >= threshold
+            elif operator == "<=":
+                match = value <= threshold
+            elif operator == "==":
+                match = abs(value - threshold) < 0.01
+            if match:
+                type_scores[dtype] += weight * value
+        # Normalize scores to probabilities
+        total = sum(type_scores.values())
+        if total > 0:
+            probabilities = [
+                (dtype, score / total)
+                for dtype, score in type_scores.items()
+            ]
+        else:
+            # Default to string if no rules match
+            probabilities = [(DataType.STRING, 0.5)]
+        # Sort by probability
+        probabilities.sort(key=lambda x: x[1], reverse=True)
+        return probabilities
+    def train(
+        self,
+        training_data: list[tuple[FeatureVector, DataType]],
+    ) -> None:
+        """Rule-based model doesn't need training, but could be tuned."""
+        pass
+class NaiveBayesModel(InferenceModel):
+    """Naive Bayes classifier for type inference.
+    Simple probabilistic model that works well with limited training data.
+    """
+    name = "naive_bayes"
+    version = "1.0"
+    def __init__(self) -> None:
+        self.class_priors: dict[DataType, float] = {}
+        self.feature_likelihoods: dict[str, dict[DataType, tuple[float, float]]] = {}
+        self._trained = False
+    def predict(
+        self,
+        features: FeatureVector,
+    ) -> list[tuple[DataType, float]]:
+        """Predict using Naive Bayes."""
+        if not self._trained:
+            # Fall back to rule-based if not trained
+            return RuleBasedModel().predict(features)
+        log_posteriors: dict[DataType, float] = {}
+        feature_dict = features.to_dict()
+        for dtype, prior in self.class_priors.items():
+            log_posterior = math.log(prior + 1e-10)
+            for feature_name, value in feature_dict.items():
+                if feature_name in self.feature_likelihoods:
+                    mean, std = self.feature_likelihoods[feature_name].get(
+                        dtype, (0.5, 0.3)
+                    )
+                    # Gaussian likelihood
+                    if std > 0:
+                        z = (value - mean) / std
+                        log_likelihood = -0.5 * z * z - math.log(std) - 0.5 * math.log(2 * math.pi)
+                        log_posterior += log_likelihood
+            log_posteriors[dtype] = log_posterior
+        # Convert to probabilities
+        max_log = max(log_posteriors.values())
+        exp_posteriors = {
+            dtype: math.exp(lp - max_log)
+            for dtype, lp in log_posteriors.items()
+        }
+        total = sum(exp_posteriors.values())
+        probabilities = [
+            (dtype, prob / total)
+            for dtype, prob in exp_posteriors.items()
+        ]
+        probabilities.sort(key=lambda x: x[1], reverse=True)
+        return probabilities
+    def train(
+        self,
+        training_data: list[tuple[FeatureVector, DataType]],
+    ) -> None:
+        """Train Naive Bayes classifier."""
+        if not training_data:
+            return
+        # Count classes
+        class_counts: dict[DataType, int] = Counter()
+        feature_values: dict[str, dict[DataType, list[float]]] = defaultdict(
+            lambda: defaultdict(list)
+        )
+        for features, dtype in training_data:
+            class_counts[dtype] += 1
+            for f in features.features:
+                feature_values[f.name][dtype].append(f.value)
+        # Calculate priors
+        total = sum(class_counts.values())
+        self.class_priors = {
+            dtype: count / total
+            for dtype, count in class_counts.items()
+        }
+        # Calculate feature likelihoods (mean, std for each feature per class)
+        for feature_name, class_values in feature_values.items():
+            self.feature_likelihoods[feature_name] = {}
+            for dtype, values in class_values.items():
+                if values:
+                    mean = sum(values) / len(values)
+                    variance = sum((v - mean) ** 2 for v in values) / len(values)
+                    std = math.sqrt(variance) if variance > 0 else 0.1
+                    self.feature_likelihoods[feature_name][dtype] = (mean, std)
+        self._trained = True
+    def save(self, path: str | Path) -> None:
+        """Save model to file."""
+        data = {
+            "class_priors": {k.value: v for k, v in self.class_priors.items()},
+            "feature_likelihoods": {
+                fname: {dtype.value: stats for dtype, stats in class_stats.items()}
+                for fname, class_stats in self.feature_likelihoods.items()
+            },
+            "trained": self._trained,
+        }
+        with open(path, "w") as f:
+            json.dump(data, f)
+    def load(self, path: str | Path) -> None:
+        """Load model from file."""
+        with open(path) as f:
+            data = json.load(f)
+        self.class_priors = {
+            DataType(k): v for k, v in data["class_priors"].items()
+        }
+        self.feature_likelihoods = {
+            fname: {DataType(dtype): tuple(stats) for dtype, stats in class_stats.items()}
+            for fname, class_stats in data["feature_likelihoods"].items()
+        }
+        self._trained = data["trained"]
+class EnsembleModel(InferenceModel):
+    """Ensemble of multiple models.
+    Combines predictions from multiple models using weighted voting.
+    """
+    name = "ensemble"
+    version = "1.0"
+    def __init__(
+        self,
+        models: list[tuple[InferenceModel, float]] | None = None,
+    ):
+        """Initialize ensemble.
+        Args:
+            models: List of (model, weight) tuples
+        """
+        self.models = models or [
+            (RuleBasedModel(), 0.6),
+            (NaiveBayesModel(), 0.4),
+        ]
+    def predict(
+        self,
+        features: FeatureVector,
+    ) -> list[tuple[DataType, float]]:
+        """Combine predictions from all models."""
+        combined_scores: dict[DataType, float] = defaultdict(float)
+        for model, weight in self.models:
+            predictions = model.predict(features)
+            for dtype, prob in predictions:
+                combined_scores[dtype] += weight * prob
+        # Normalize
+        total = sum(combined_scores.values())
+        if total > 0:
+            probabilities = [
+                (dtype, score / total)
+                for dtype, score in combined_scores.items()
+            ]
+        else:
+            probabilities = [(DataType.STRING, 1.0)]
+        probabilities.sort(key=lambda x: x[1], reverse=True)
+        return probabilities
+    def train(
+        self,
+        training_data: list[tuple[FeatureVector, DataType]],
+    ) -> None:
+        """Train all models in ensemble."""
+        for model, _ in self.models:
+            model.train(training_data)
+# =============================================================================
+# Model Registry
+# =============================================================================
+class ModelRegistry:
+    """Registry for inference models."""
+    def __init__(self) -> None:
+        self._models: dict[str, type[InferenceModel]] = {}
+    def register(
+        self,
+        name: str,
+        model_class: type[InferenceModel],
+    ) -> None:
+        """Register a model class."""
+        self._models[name] = model_class
+    def create(self, name: str, **kwargs: Any) -> InferenceModel:
+        """Create a model instance."""
+        if name not in self._models:
+            raise KeyError(f"Unknown model: {name}")
+        return self._models[name](**kwargs)
+    def list_models(self) -> list[str]:
+        """List available models."""
+        return list(self._models.keys())
+# Module-level model registry, populated at import time with the three
+# built-in model classes under the names accepted by ``create``.
+model_registry = ModelRegistry()
+model_registry.register("rule_based", RuleBasedModel)
+model_registry.register("naive_bayes", NaiveBayesModel)
+model_registry.register("ensemble", EnsembleModel)
+# =============================================================================
+# ML Type Inferrer
+# =============================================================================
+@dataclass
+class InferrerConfig:
+    """Configuration for ML type inferrer."""
+    # Model name.
+    # NOTE(review): MLTypeInferrer.__init__ picks the model from its own
+    # ``model`` argument and does not appear to read this field - confirm.
+    model: str = "ensemble"
+    # NOTE(review): not referenced anywhere in the visible part of this
+    # module; presumably a minimum confidence for accepting a result - confirm.
+    confidence_threshold: float = 0.5
+    # When True, MLTypeInferrer.infer looks up and stores results in its cache.
+    use_caching: bool = True
+    # Maximum number of cached results MLTypeInferrer keeps before evicting.
+    cache_size: int = 1000
+    # When False, MLTypeInferrer.provide_feedback returns without recording.
+    enable_learning: bool = True
+    # Optional model file: loaded at construction if it exists, and written
+    # after each feedback-triggered retraining.
+    model_path: str | None = None
+class MLTypeInferrer:
+    """ML-based type inferrer.
+    Main interface for ML-powered type inference.
+    Example:
+        inferrer = MLTypeInferrer()
+        result = inferrer.infer(column, context={
+            "column_name": "email",
+            "table_name": "users",
+        })
+        print(f"Inferred: {result.inferred_type} ({result.confidence:.0%})")
+    """
+    def __init__(
+        self,
+        model: str | InferenceModel = "ensemble",
+        config: InferrerConfig | None = None,
+    ):
+        self.config = config or InferrerConfig()
+        if isinstance(model, InferenceModel):
+            self._model = model
+        else:
+            self._model = model_registry.create(model)
+        self._feature_registry = feature_extractor_registry
+        self._cache: dict[str, InferenceResult] = {}
+        self._feedback_buffer: list[tuple[FeatureVector, DataType]] = []
+        self._lock = threading.Lock()
+        # Load saved model if path provided
+        if self.config.model_path and Path(self.config.model_path).exists():
+            self._model.load(self.config.model_path)
+    def infer(
+        self,
+        column: pl.Series,
+        context: dict[str, Any] | None = None,
+    ) -> InferenceResult:
+        """Infer column type using ML.
+        Args:
+            column: Column data
+            context: Additional context information
+        Returns:
+            Inference result with type and confidence
+        """
+        import time
+        start = time.time()
+        context = context or {}
+        context["column_name"] = context.get("column_name", column.name or "")
+        # Check cache
+        cache_key = self._make_cache_key(column, context)
+        if self.config.use_caching and cache_key in self._cache:
+            return self._cache[cache_key]
+        # Extract features
+        features = self._feature_registry.extract_all(column, context)
+        # Get predictions
+        predictions = self._model.predict(features)
+        if not predictions:
+            predictions = [(DataType.STRING, 0.5)]
+        # Build result
+        top_type, top_confidence = predictions[0]
+        alternatives = predictions[1:5]  # Top 5 alternatives
+        # Generate reasoning
+        reasoning = self._generate_reasoning(features, predictions)
+        elapsed_ms = (time.time() - start) * 1000
+        result = InferenceResult(
+            column_name=context["column_name"],
+            inferred_type=top_type,
+            confidence=top_confidence,
+            alternatives=alternatives,
+            reasoning=reasoning,
+            features_used=[f.name for f in features.features[:10]],
+            model_version=self._model.version,
+            inference_time_ms=elapsed_ms,
+        )
+        # Cache result
+        if self.config.use_caching:
+            with self._lock:
+                self._cache[cache_key] = result
+                # LRU eviction
+                if len(self._cache) > self.config.cache_size:
+                    oldest_key = next(iter(self._cache))
+                    del self._cache[oldest_key]
+        return result
+    def infer_table(
+        self,
+        df: pl.DataFrame,
+        table_name: str = "",
+    ) -> dict[str, InferenceResult]:
+        """Infer types for all columns in a table.
+        Args:
+            df: DataFrame to analyze
+            table_name: Table name for context
+        Returns:
+            Dictionary mapping column names to results
+        """
+        results = {}
+        columns = df.columns
+        for i, col_name in enumerate(columns):
+            # Build context with neighboring columns
+            neighbors = []
+            if i > 0:
+                neighbors.append(columns[i - 1])
+            if i < len(columns) - 1:
+                neighbors.append(columns[i + 1])
+            context = {
+                "column_name": col_name,
+                "table_name": table_name,
+                "column_index": i,
+                "total_columns": len(columns),
+                "neighbor_columns": neighbors,
+            }
+            result = self.infer(df.get_column(col_name), context)
+            results[col_name] = result
+        return results
+    def provide_feedback(
+        self,
+        column: pl.Series,
+        true_type: DataType,
+        context: dict[str, Any] | None = None,
+    ) -> None:
+        """Provide feedback for online learning.
+        Args:
+            column: Column that was classified
+            true_type: The correct type
+            context: Context used during inference
+        """
+        if not self.config.enable_learning:
+            return
+        context = context or {}
+        context["column_name"] = context.get("column_name", column.name or "")
+        features = self._feature_registry.extract_all(column, context)
+        with self._lock:
+            self._feedback_buffer.append((features, true_type))
+            # Retrain when buffer is large enough
+            if len(self._feedback_buffer) >= 100:
+                self._model.train(self._feedback_buffer)
+                self._feedback_buffer.clear()
+                # Save model if path configured
+                if self.config.model_path:
+                    self._model.save(self.config.model_path)
+    def _make_cache_key(
+        self,
+        column: pl.Series,
+        context: dict[str, Any],
+    ) -> str:
+        """Create cache key for column + context."""
+        # Use column sample and context for key
+        sample = column.head(10).to_list()
+        key_data = f"{context.get('column_name', '')}:{sample}:{column.dtype}"
+        return hashlib.md5(key_data.encode()).hexdigest()
+    def _generate_reasoning(
+        self,
+        features: FeatureVector,
+        predictions: list[tuple[DataType, float]],
+    ) -> list[str]:
+        """Generate human-readable reasoning."""
+        reasoning = []
+        # Get top features
+        sorted_features = sorted(
+            features.features,
+            key=lambda f: abs(f.value - 0.5),  # Deviation from neutral
+            reverse=True,
+        )
+        for f in sorted_features[:5]:
+            if f.value > 0.7:
+                reasoning.append(f"High {f.name}: {f.value:.2f}")
+            elif f.value < 0.3:
+                reasoning.append(f"Low {f.name}: {f.value:.2f}")
+        if predictions:
+            top_type, top_conf = predictions[0]
+            reasoning.append(f"Best match: {top_type.value} ({top_conf:.0%})")
+        return reasoning
+    def clear_cache(self) -> None:
+        """Clear inference cache."""
+        with self._lock:
+            self._cache.clear()
+# =============================================================================
+# Convenience Functions
+# =============================================================================
+def create_inference_model(
+    model_type: str = "ensemble",
+    **kwargs: Any,
+) -> InferenceModel:
+    """Create an inference model.
+    Args:
+        model_type: Model type name
+        **kwargs: Model configuration
+    Returns:
+        Configured model
+    """
+    return model_registry.create(model_type, **kwargs)
+def infer_column_type_ml(
+    column: pl.Series,
+    context: dict[str, Any] | None = None,
+    model: str = "ensemble",
+) -> InferenceResult:
+    """Infer column type using ML.
+    Args:
+        column: Column to analyze
+        context: Additional context
+        model: Model to use
+    Returns:
+        Inference result
+    """
+    inferrer = MLTypeInferrer(model=model)
+    return inferrer.infer(column, context)
+def infer_table_types_ml(
+    df: pl.DataFrame,
+    table_name: str = "",
+    model: str = "ensemble",
+) -> dict[str, InferenceResult]:
+    """Infer types for all columns in a table.
+    Args:
+        df: DataFrame to analyze
+        table_name: Table name for context
+        model: Model to use
+    Returns:
+        Dictionary of column results
+    """
+    inferrer = MLTypeInferrer(model=model)
+    return inferrer.infer_table(df, table_name)