PyPI - churnkit - Versions diffs - 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl - Mend

churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{churnkit-0.76.0a3.data → churnkit-0.76.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb RENAMED Viewed

@@ -112,22 +112,27 @@
    "outputs": [],
    "source": [
     "from customer_retention.analysis.notebook_progress import track_and_export_previous\n",
+    "\n",
     "track_and_export_previous(\"02a_text_columns_deep_dive.ipynb\")\n",
     "\n",
-    "from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
-    "from customer_retention.analysis.visualization import ChartBuilder, display_figure, display_table, console\n",
-    "from customer_retention.core.config.column_config import ColumnType\n",
-    "from customer_retention.stages.profiling import (\n",
-    "    TextColumnProcessor, TextProcessingConfig, TextColumnResult,\n",
-    "    TextEmbedder, TextDimensionalityReducer,\n",
-    "    EMBEDDING_MODELS, get_model_info, list_available_models\n",
-    ")\n",
-    "import pandas as pd\n",
     "import numpy as np\n",
-    "import plotly.graph_objects as go\n",
     "import plotly.express as px\n",
+    "import plotly.graph_objects as go\n",
     "from plotly.subplots import make_subplots\n",
-    "from customer_retention.core.config.experiments import FINDINGS_DIR, EXPERIMENTS_DIR, OUTPUT_DIR, setup_experiments_structure"
+    "\n",
+    "from customer_retention.analysis.auto_explorer import ExplorationFindings, TextProcessingMetadata\n",
+    "from customer_retention.analysis.visualization import ChartBuilder, display_figure\n",
+    "from customer_retention.core.config.column_config import ColumnType\n",
+    "from customer_retention.core.config.experiments import (\n",
+    "    EXPERIMENTS_DIR,\n",
+    "    FINDINGS_DIR,\n",
+    ")\n",
+    "from customer_retention.stages.profiling import (\n",
+    "    TextColumnProcessor,\n",
+    "    TextProcessingConfig,\n",
+    "    get_model_info,\n",
+    "    list_available_models,\n",
+    ")"
    ]
   },
   {
@@ -249,7 +254,7 @@
    },
    "outputs": [],
    "source": [
-    "from customer_retention.stages.temporal import load_data_with_snapshot_preference, TEMPORAL_METADATA_COLS\n",
+    "from customer_retention.stages.temporal import load_data_with_snapshot_preference\n",
     "\n",
     "df, data_source = load_data_with_snapshot_preference(findings, output_dir=str(FINDINGS_DIR))\n",
     "charts = ChartBuilder()\n",
@@ -427,27 +432,27 @@
     "        print(f\"\\n{'='*70}\")\n",
     "        print(f\"Column: {col_name}\")\n",
     "        print(f\"{'='*70}\")\n",
-    "        \n",
+    "\n",
     "        text_series = df[col_name].fillna(\"\")\n",
-    "        \n",
+    "\n",
     "        # Basic statistics\n",
     "        non_empty = (text_series.str.len() > 0).sum()\n",
     "        avg_length = text_series.str.len().mean()\n",
     "        max_length = text_series.str.len().max()\n",
-    "        \n",
-    "        print(f\"\\n\\U0001f4ca Statistics:\")\n",
+    "\n",
+    "        print(\"\\n\\U0001f4ca Statistics:\")\n",
     "        print(f\"   Total rows: {len(text_series):,}\")\n",
     "        print(f\"   Non-empty: {non_empty:,} ({non_empty/len(text_series)*100:.1f}%)\")\n",
     "        print(f\"   Avg length: {avg_length:.0f} characters\")\n",
     "        print(f\"   Max length: {max_length:,} characters\")\n",
-    "        \n",
+    "\n",
     "        # Sample texts\n",
-    "        print(f\"\\n\\U0001f4dd Sample texts:\")\n",
+    "        print(\"\\n\\U0001f4dd Sample texts:\")\n",
     "        samples = text_series[text_series.str.len() > 10].head(3)\n",
     "        for i, sample in enumerate(samples, 1):\n",
     "            truncated = sample[:100] + \"...\" if len(sample) > 100 else sample\n",
     "            print(f\"   {i}. {truncated}\")\n",
-    "        \n",
+    "\n",
     "        # Text length distribution\n",
     "        lengths = text_series.str.len()\n",
     "        fig = go.Figure()\n",
@@ -511,27 +516,27 @@
    "source": [
     "if text_columns:\n",
     "    processor = TextColumnProcessor(config)\n",
-    "    \n",
+    "\n",
     "    print(\"Processing TEXT columns...\")\n",
     "    print(\"(This may take a moment for large datasets)\\n\")\n",
-    "    \n",
+    "\n",
     "    results = []\n",
     "    df_processed = df.copy()\n",
-    "    \n",
+    "\n",
     "    for col_name in text_columns:\n",
     "        print(f\"\\n{'='*70}\")\n",
     "        print(f\"Processing: {col_name}\")\n",
     "        print(f\"{'='*70}\")\n",
-    "        \n",
+    "\n",
     "        df_processed, result = processor.process_column(df_processed, col_name)\n",
     "        results.append(result)\n",
-    "        \n",
-    "        print(f\"\\n\\u2705 Processing complete:\")\n",
+    "\n",
+    "        print(\"\\n\\u2705 Processing complete:\")\n",
     "        print(f\"   Embedding shape: {result.embeddings_shape}\")\n",
     "        print(f\"   Components kept: {result.n_components}\")\n",
     "        print(f\"   Explained variance: {result.explained_variance:.1%}\")\n",
     "        print(f\"   Features created: {', '.join(result.component_columns)}\")\n",
-    "    \n",
+    "\n",
     "    print(f\"\\n\\n{'='*70}\")\n",
     "    print(\"PROCESSING SUMMARY\")\n",
     "    print(f\"{'='*70}\")\n",
@@ -586,32 +591,32 @@
     "        print(f\"\\n{'='*70}\")\n",
     "        print(f\"Results: {result.column_name}\")\n",
     "        print(f\"{'='*70}\")\n",
-    "        \n",
+    "\n",
     "        # Explained variance per component\n",
     "        reducer = processor._reducers[result.column_name]\n",
     "        var_ratios = reducer._pca.explained_variance_ratio_\n",
     "        cumulative = np.cumsum(var_ratios)\n",
-    "        \n",
+    "\n",
     "        fig = make_subplots(rows=1, cols=2,\n",
     "                            subplot_titles=(\"Variance per Component\", \"Cumulative Variance\"))\n",
-    "        \n",
+    "\n",
     "        fig.add_trace(go.Bar(\n",
     "            x=[f\"PC{i+1}\" for i in range(len(var_ratios))],\n",
     "            y=var_ratios,\n",
     "            marker_color='steelblue'\n",
     "        ), row=1, col=1)\n",
-    "        \n",
+    "\n",
     "        fig.add_trace(go.Scatter(\n",
     "            x=[f\"PC{i+1}\" for i in range(len(cumulative))],\n",
     "            y=cumulative,\n",
     "            mode='lines+markers',\n",
     "            line_color='green'\n",
     "        ), row=1, col=2)\n",
-    "        \n",
+    "\n",
     "        fig.add_hline(y=config.variance_threshold, line_dash=\"dash\", line_color=\"red\",\n",
     "                      annotation_text=f\"Target: {config.variance_threshold:.0%}\",\n",
     "                      row=1, col=2)\n",
-    "        \n",
+    "\n",
     "        fig.update_layout(\n",
     "            title=f\"PCA Results: {result.column_name}\",\n",
     "            height=400,\n",
@@ -621,7 +626,7 @@
     "        fig.update_yaxes(title_text=\"Variance Ratio\", row=1, col=1)\n",
     "        fig.update_yaxes(title_text=\"Cumulative Variance\", row=1, col=2)\n",
     "        display_figure(fig)\n",
-    "        \n",
+    "\n",
     "        # PC feature distributions\n",
     "        if len(result.component_columns) >= 2:\n",
     "            fig = px.scatter(\n",
@@ -687,16 +692,17 @@
     "            processing_approach=\"pca\"\n",
     "        )\n",
     "        findings.text_processing[result.column_name] = metadata\n",
-    "        \n",
+    "\n",
     "        print(f\"\\u2705 Added metadata for {result.column_name}:\")\n",
     "        print(f\"   Model: {metadata.embedding_model}\")\n",
     "        print(f\"   Components: {metadata.n_components}\")\n",
     "        print(f\"   Explained variance: {metadata.explained_variance:.1%}\")\n",
-    "    \n",
+    "\n",
     "    findings.save(FINDINGS_PATH)\n",
     "    print(f\"\\nFindings saved to: {FINDINGS_PATH}\")\n",
     "\n",
     "from customer_retention.analysis.notebook_html_exporter import export_notebook_html\n",
+    "\n",
     "export_notebook_html(Path(\"02a_text_columns_deep_dive.ipynb\"), EXPERIMENTS_DIR / \"docs\")\n"
    ]
   },
@@ -743,15 +749,15 @@
     "    print(\"\\n\" + \"=\"*70)\n",
     "    print(\"PRODUCTION RECOMMENDATIONS\")\n",
     "    print(\"=\"*70)\n",
-    "    \n",
+    "\n",
     "    for result in results:\n",
     "        print(f\"\\n\\U0001f527 {result.column_name}:\")\n",
-    "        print(f\"   Action: embed_reduce (embeddings + PCA)\")\n",
+    "        print(\"   Action: embed_reduce (embeddings + PCA)\")\n",
     "        print(f\"   Model: {config.embedding_model}\")\n",
     "        print(f\"   Variance threshold: {config.variance_threshold:.0%}\")\n",
     "        print(f\"   Expected features: {result.n_components}\")\n",
     "        print(f\"   Feature names: {', '.join(result.component_columns[:3])}...\")\n",
-    "    \n",
+    "\n",
     "    print(\"\\n\\U0001f4a1 These recommendations will be used by the pipeline generator.\")\n",
     "    print(\"   The same processing will be applied in production.\")"
    ]

churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl

churnkit 0.76.0a3__py3-none-any.whl → 0.76.1a2__py3-none-any.whl