PyPI - staran - Versions diffs - 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

staran 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

staran/__init__.py +10 -0
staran/tools/__init__.py +5 -5
staran-1.0.0.dist-info/METADATA +301 -0
staran-1.0.0.dist-info/RECORD +8 -0
staran/banks/__init__.py +0 -30
staran/banks/xinjiang_icbc/__init__.py +0 -90
staran/engines/__init__.py +0 -65
staran/engines/base.py +0 -255
staran/engines/hive.py +0 -163
staran/engines/spark.py +0 -252
staran/engines/turing.py +0 -439
staran/examples/__init__.py +0 -8
staran/examples/aum_longtail.py +0 -250
staran/examples/aum_longtail_old.py +0 -487
staran/features/__init__.py +0 -59
staran/features/engines.py +0 -284
staran/features/generator.py +0 -603
staran/features/manager.py +0 -155
staran/features/schema.py +0 -193
staran/models/__init__.py +0 -72
staran/models/bank_configs.py +0 -269
staran/models/config.py +0 -271
staran/models/daifa_models.py +0 -361
staran/models/registry.py +0 -281
staran/models/target.py +0 -321
staran/schemas/__init__.py +0 -27
staran/schemas/aum/__init__.py +0 -210
staran/schemas/document_generator.py +0 -350
staran/tools/document_generator.py +0 -350
staran-0.6.0.dist-info/METADATA +0 -564
staran-0.6.0.dist-info/RECORD +0 -33
{staran-0.6.0.dist-info → staran-1.0.0.dist-info}/WHEEL +0 -0
{staran-0.6.0.dist-info → staran-1.0.0.dist-info}/licenses/LICENSE +0 -0
{staran-0.6.0.dist-info → staran-1.0.0.dist-info}/top_level.txt +0 -0

staran/schemas/aum/__init__.py DELETED

@@ -1,210 +0,0 @@
-"""
-新疆工行代发长尾客户表结构定义模块
-包含新疆工行代发长尾客户相关的所有表结构：
-- 代发长尾客户行为特征表 (XinjiangICBCDaifaLongtailBehaviorSchema)
-- 代发长尾客户资产平均值表 (XinjiangICBCDaifaLongtailAssetAvgSchema)
-- 代发长尾客户资产配置表 (XinjiangICBCDaifaLongtailAssetConfigSchema)
-- 代发长尾客户月度统计表 (XinjiangICBCDaifaLongtailMonthlyStatSchema)
-数据库: xinjiang_icbc_daifa_longtail
-业务范围: 代发长尾客户（资产10k-100k）
-这些表结构可以用于：
-1. 代发长尾客户特征工程
-2. 提升模型和防流失模型构建
-3. 业务文档生成
-4. 数据质量检查
-"""
-from typing import Dict
-from ...features.schema import TableSchema
-from ...tools.document_generator import SchemaDocumentGenerator
-class XinjiangICBCDaifaLongtailBehaviorSchema:
-    """新疆工行代发长尾客户行为特征表 - 严格按照已提供给行方的字段"""
-    # 统计指标定义 - 与原始定义完全一致
-    _STATS = [
-        ("max", "最大值"),
-        ("min", "最小值"),
-        ("sum", "总和"),
-        ("avg", "均值"),
-        ("var", "方差"),
-        ("std", "标准差"),
-        ("rng", "极差"),
-        ("med", "中位数"),
-    ]
-    @staticmethod
-    def create() -> TableSchema:
-        """创建新疆工行代发长尾客户行为特征表结构"""
-        schema = TableSchema('xinjiang_icbc_daifa_hlwj_dfcw_f1_f4_wy')
-        schema.add_primary_key('party_id', 'string')
-        schema.add_date_field('data_dt', 'string')
-        # 基础字段 - 严格按照原始定义
-        schema.add_field("buy_ct", "int", comment="购买次数", aggregatable=True)
-        schema.add_field("recency", "int", comment="最近一次购买距今天数", aggregatable=True)
-        schema.add_field("tenure", "int", comment="客户关系持续时间", aggregatable=True)
-        schema.add_field("window1", "string", comment="时间窗口标记")
-        schema.add_field("freq", "float", comment="总购买频率", aggregatable=True)
-        schema.add_field("freq1", "float", comment="最近时间段购买频率", aggregatable=True)
-        schema.add_field("productidcount", "int", comment="产品种类数", aggregatable=True)
-        schema.add_field("orderidcount", "int", comment="订单数", aggregatable=True)
-        schema.add_field("label", "float", comment="标签值（如是否购买）", aggregatable=True)
-        # productamount and m1 ~ m4 的含义描述
-        m_fields = {
-            "productamount": "购买金额",
-            "m1": "去重订单数",
-            "m2": "去重商品数",
-            "m3": "去重渠道数",
-            "m4": "去重产品品类数",
-        }
-        # 使用循环注册 productamount and m1~m4 各统计字段
-        for prefix, meaning in m_fields.items():
-            for stat_key, stat_desc in XinjiangICBCDaifaLongtailBehaviorSchema._STATS:
-                field_name = f"{prefix}_{stat_key}"
-                description = f"{meaning}的{stat_desc}"
-                schema.add_field(field_name, "float", comment=description, aggregatable=True)
-        # 其他字段
-        schema.add_field("life_day", "float", comment="客户生命周期天数", aggregatable=True)
-        schema.add_field("gender", "float", comment="性别（编码）", aggregatable=True)
-        schema.add_field("open_day", "float", comment="开户天数", aggregatable=True)
-        schema.set_monthly_unique(False)  # 每人每日记录
-        return schema
-class XinjiangICBCDaifaLongtailAssetAvgSchema:
-    """新疆工行代发长尾客户资产平均值表"""
-    @staticmethod
-    def create() -> TableSchema:
-        """创建新疆工行代发长尾客户资产平均值表结构"""
-        schema = TableSchema('xinjiang_icbc_daifa_hlwj_zi_chan_avg_wy')
-        schema.add_primary_key('party_id', 'string')
-        schema.add_date_field('data_dt', 'string')
-        # 基础余额字段
-        schema.add_field("asset_total_bal", "decimal", comment="总资产余额", aggregatable=True)
-        schema.add_field("liab_total_bal", "decimal", comment="总负债余额", aggregatable=True)
-        schema.add_field("net_asset_bal", "decimal", comment="净资产余额", aggregatable=True)
-        # 存款相关字段
-        schema.add_field("dep_bal", "decimal", comment="存款余额", aggregatable=True)
-        schema.add_field("current_dep_bal", "decimal", comment="活期存款余额", aggregatable=True)
-        schema.add_field("time_dep_bal", "decimal", comment="定期存款余额", aggregatable=True)
-        # 理财投资字段
-        schema.add_field("wealth_bal", "decimal", comment="理财余额", aggregatable=True)
-        schema.add_field("fund_bal", "decimal", comment="基金余额", aggregatable=True)
-        schema.add_field("insurance_bal", "decimal", comment="保险余额", aggregatable=True)
-        schema.set_monthly_unique(True)  # 每人每月一条记录
-        return schema
-class XinjiangICBCDaifaLongtailAssetConfigSchema:
-    """新疆工行代发长尾客户资产配置表"""
-    @staticmethod
-    def create() -> TableSchema:
-        """创建新疆工行代发长尾客户资产配置表结构"""
-        schema = TableSchema('xinjiang_icbc_daifa_hlwj_zi_chan_config_wy')
-        schema.add_primary_key('party_id', 'string')
-        schema.add_date_field('data_dt', 'string')
-        # 资产配置比例字段
-        schema.add_field("cash_ratio", "float", comment="现金类资产占比", aggregatable=True)
-        schema.add_field("fixed_income_ratio", "float", comment="固收类资产占比", aggregatable=True)
-        schema.add_field("equity_ratio", "float", comment="权益类资产占比", aggregatable=True)
-        schema.add_field("alternative_ratio", "float", comment="另类资产占比", aggregatable=True)
-        # 风险偏好相关
-        schema.add_field("risk_level", "int", comment="风险偏好等级(1-5)", aggregatable=True)
-        schema.add_field("investment_experience", "int", comment="投资经验年限", aggregatable=True)
-        # 配置变化指标
-        schema.add_field("config_change_freq", "int", comment="配置调整频率", aggregatable=True)
-        schema.add_field("rebalance_count", "int", comment="再平衡次数", aggregatable=True)
-        schema.set_monthly_unique(True)
-        return schema
-class XinjiangICBCDaifaLongtailMonthlyStatSchema:
-    """新疆工行代发长尾客户月度统计表"""
-    @staticmethod
-    def create() -> TableSchema:
-        """创建新疆工行代发长尾客户月度统计表结构"""
-        schema = TableSchema('xinjiang_icbc_daifa_hlwj_monthly_stat_wy')
-        schema.add_primary_key('party_id', 'string')
-        schema.add_date_field('data_dt', 'string')
-        # 月度交易统计
-        schema.add_field("monthly_txn_count", "int", comment="月度交易笔数", aggregatable=True)
-        schema.add_field("monthly_txn_amount", "decimal", comment="月度交易金额", aggregatable=True)
-        schema.add_field("monthly_deposit_amount", "decimal", comment="月度存入金额", aggregatable=True)
-        schema.add_field("monthly_withdraw_amount", "decimal", comment="月度取出金额", aggregatable=True)
-        # 代发工资相关统计
-        schema.add_field("salary_amount", "decimal", comment="月度代发工资金额", aggregatable=True)
-        schema.add_field("salary_date", "string", comment="代发工资日期")
-        schema.add_field("salary_stability", "float", comment="工资稳定性指数", aggregatable=True)
-        # 长尾客户特征
-        schema.add_field("longtail_score", "float", comment="长尾客户评分", aggregatable=True)
-        schema.add_field("upgrade_potential", "float", comment="提升潜力评分", aggregatable=True)
-        schema.add_field("churn_risk", "float", comment="流失风险评分", aggregatable=True)
-        # 活跃度指标
-        schema.add_field("login_days", "int", comment="月度登录天数", aggregatable=True)
-        schema.add_field("channel_usage", "string", comment="渠道使用情况")
-        schema.set_monthly_unique(True)
-        return schema
-def get_xinjiang_icbc_daifa_longtail_schemas() -> Dict[str, TableSchema]:
-    """获取新疆工行代发长尾客户所有表结构"""
-    return {
-        'daifa_longtail_behavior': XinjiangICBCDaifaLongtailBehaviorSchema.create(),
-        'daifa_longtail_asset_avg': XinjiangICBCDaifaLongtailAssetAvgSchema.create(),
-        'daifa_longtail_asset_config': XinjiangICBCDaifaLongtailAssetConfigSchema.create(),
-        'daifa_longtail_monthly_stat': XinjiangICBCDaifaLongtailMonthlyStatSchema.create(),
-    }
-def export_xinjiang_icbc_daifa_longtail_docs(output_dir: str = "./docs") -> Dict[str, str]:
-    """导出新疆工行代发长尾客户表结构文档"""
-    generator = SchemaDocumentGenerator()
-    schemas = get_xinjiang_icbc_daifa_longtail_schemas()
-    exported_files = {}
-    for table_type, schema in schemas.items():
-        file_path = generator.export_schema_doc(
-            schema,
-            business_domain="新疆工行代发长尾客户",
-            table_type=table_type,
-            output_dir=output_dir
-        )
-        exported_files[table_type] = file_path
-    return exported_files
-# 导出主要组件
-__all__ = [
-    'XinjiangICBCDaifaLongtailBehaviorSchema',
-    'XinjiangICBCDaifaLongtailAssetAvgSchema',
-    'XinjiangICBCDaifaLongtailAssetConfigSchema',
-    'XinjiangICBCDaifaLongtailMonthlyStatSchema',
-    'get_xinjiang_icbc_daifa_longtail_schemas',
-    'export_xinjiang_icbc_daifa_longtail_docs'
-]

staran/schemas/document_generator.py DELETED

@@ -1,350 +0,0 @@
-"""
-Schema文档生成器
-支持根据表结构定义生成多种格式的技术文档：
-- Markdown格式：适合开发团队和版本控制
-- PDF格式：适合正式交付和业务方审阅
-- HTML格式：适合在线查看和分享
-主要功能：
-1. 表结构自动解析
-2. 字段信息格式化
-3. 业务含义说明
-4. 技术规范文档
-5. 多格式导出支持
-"""
-import os
-from typing import Dict, List, Optional
-from datetime import datetime
-class SchemaDocumentGenerator:
-    """表结构文档生成器"""
-    def __init__(self):
-        self.template_configs = {
-            'markdown': {
-                'extension': '.md',
-                'header_template': self._get_markdown_header_template(),
-                'field_template': self._get_markdown_field_template(),
-                'footer_template': self._get_markdown_footer_template()
-            },
-            'pdf': {
-                'extension': '.pdf',
-                'requires_conversion': True,
-                'base_format': 'markdown'  # 先生成MD再转PDF
-            },
-            'html': {
-                'extension': '.html',
-                'header_template': self._get_html_header_template(),
-                'field_template': self._get_html_field_template(),
-                'footer_template': self._get_html_footer_template()
-            }
-        }
-    def export_schema_doc(self, schema, business_domain: str, table_type: str,
-                         output_dir: str = "./docs", format_type: str = "markdown") -> str:
-        """
-        导出表结构文档
-        Args:
-            schema: TableSchema对象
-            business_domain: 业务域名称 (如: AUM, CRM, RISK)
-            table_type: 表类型 (如: behavior, asset_avg)
-            output_dir: 输出目录
-            format_type: 文档格式 ('markdown', 'pdf', 'html')
-        Returns:
-            生成的文档文件路径
-        """
-        # 确保输出目录存在
-        os.makedirs(output_dir, exist_ok=True)
-        # 生成文件名
-        timestamp = datetime.now().strftime("%Y%m%d")
-        filename = f"{business_domain}_{table_type}_schema_{timestamp}"
-        if format_type.lower() == 'pdf':
-            # PDF格式先生成Markdown再转换
-            md_content = self._generate_markdown_content(schema, business_domain, table_type)
-            md_path = os.path.join(output_dir, f"{filename}.md")
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write(md_content)
-            # 转换为PDF (这里可以集成pandoc或其他转换工具)
-            pdf_path = os.path.join(output_dir, f"{filename}.pdf")
-            self._convert_md_to_pdf(md_path, pdf_path)
-            return pdf_path
-        elif format_type.lower() == 'html':
-            # HTML格式
-            html_content = self._generate_html_content(schema, business_domain, table_type)
-            html_path = os.path.join(output_dir, f"{filename}.html")
-            with open(html_path, 'w', encoding='utf-8') as f:
-                f.write(html_content)
-            return html_path
-        else:
-            # 默认Markdown格式
-            md_content = self._generate_markdown_content(schema, business_domain, table_type)
-            md_path = os.path.join(output_dir, f"{filename}.md")
-            with open(md_path, 'w', encoding='utf-8') as f:
-                f.write(md_content)
-            return md_path
-    def _generate_markdown_content(self, schema, business_domain: str, table_type: str) -> str:
-        """生成Markdown格式内容"""
-        content = []
-        # 文档头部
-        content.append(f"# {business_domain} - {table_type.upper()}表结构文档")
-        content.append("")
-        content.append(f"## 基本信息")
-        content.append("")
-        content.append(f"- **表名**: `{schema.table_name}`")
-        content.append(f"- **业务域**: {business_domain}")
-        content.append(f"- **表类型**: {table_type}")
-        content.append(f"- **生成时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-        content.append(f"- **月度唯一性**: {'是' if getattr(schema, 'is_monthly_unique', False) else '否'}")
-        content.append("")
-        # 表结构说明
-        content.append("## 表结构说明")
-        content.append("")
-        if hasattr(schema, 'description'):
-            content.append(f"{schema.description}")
-            content.append("")
-        # 主键信息
-        if hasattr(schema, 'primary_key') and schema.primary_key:
-            content.append("### 主键字段")
-            content.append("")
-            content.append(f"- `{schema.primary_key}` (主键)")
-            content.append("")
-        # 日期字段
-        if hasattr(schema, 'date_field') and schema.date_field:
-            content.append("### 日期字段")
-            content.append("")
-            content.append(f"- `{schema.date_field}` (日期字段)")
-            content.append("")
-        # 字段详情表格
-        content.append("## 字段详情")
-        content.append("")
-        content.append("| 字段名 | 数据类型 | 业务含义 | 可聚合 | 备注 |")
-        content.append("|--------|----------|----------|--------|------|")
-        if hasattr(schema, 'fields'):
-            for field_name, field in schema.fields.items():
-                # 简化数据类型显示
-                field_type_str = str(field.field_type) if hasattr(field, 'field_type') else 'string'
-                field_type = field_type_str.replace('FieldType.', '').lower()
-                comment = field.comment if hasattr(field, 'comment') else ''
-                aggregatable = '是' if getattr(field, 'aggregatable', False) else '否'
-                remarks = ''  # 可以从其他地方获取备注
-                content.append(f"| `{field_name}` | {field_type} | {comment} | {aggregatable} | {remarks} |")
-        content.append("")
-        # 业务规则说明
-        content.append("## 业务规则")
-        content.append("")
-        content.append("### 数据更新规则")
-        if getattr(schema, 'is_monthly_unique', False):
-            content.append("- 每人每月一条记录")
-            content.append("- 月末批量更新")
-        else:
-            content.append("- 每人每日一条记录")
-            content.append("- 日终批量更新")
-        content.append("")
-        content.append("### 数据质量要求")
-        content.append("- 主键字段不允许为空")
-        content.append("- 日期字段格式统一为YYYYMMDD")
-        content.append("- 金额字段精度保持2位小数")
-        content.append("- 比例字段取值范围[0,1]")
-        content.append("")
-        # 使用说明
-        content.append("## 使用说明")
-        content.append("")
-        content.append("### 特征工程配置")
-        if table_type == 'behavior':
-            content.append("- 生成原始拷贝特征")
-            content.append("- 生成聚合特征")
-            content.append("- 不生成环比、同比特征")
-        else:
-            content.append("- 生成聚合特征")
-            content.append("- 生成5个月环比特征")
-            content.append("- 生成1年同比特征")
-        content.append("")
-        content.append("### 示例SQL查询")
-        content.append("```sql")
-        content.append(f"-- 查询最新数据")
-        content.append(f"SELECT * FROM {schema.table_name}")
-        content.append(f"WHERE data_dt = (SELECT MAX(data_dt) FROM {schema.table_name})")
-        content.append(f"LIMIT 10;")
-        content.append("```")
-        content.append("")
-        # 文档尾部
-        content.append("---")
-        content.append("*本文档由Staran Schema自动生成*")
-        return "\n".join(content)
-    def _generate_html_content(self, schema, business_domain: str, table_type: str) -> str:
-        """生成HTML格式内容"""
-        # 基础HTML模板，可以根据需要扩展
-        html_content = f"""
-<!DOCTYPE html>
-<html lang="zh-CN">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>{business_domain} - {table_type.upper()}表结构文档</title>
-    <style>
-        body {{ font-family: 'Microsoft YaHei', Arial, sans-serif; margin: 40px; }}
-        h1, h2, h3 {{ color: #333; }}
-        table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
-        th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
-        th {{ background-color: #f2f2f2; font-weight: bold; }}
-        code {{ background-color: #f4f4f4; padding: 2px 4px; border-radius: 3px; }}
-        .info-table {{ background-color: #f9f9f9; }}
-    </style>
-</head>
-<body>
-    <h1>{business_domain} - {table_type.upper()}表结构文档</h1>
-    <h2>基本信息</h2>
-    <table class="info-table">
-        <tr><th>表名</th><td><code>{schema.table_name}</code></td></tr>
-        <tr><th>业务域</th><td>{business_domain}</td></tr>
-        <tr><th>表类型</th><td>{table_type}</td></tr>
-        <tr><th>生成时间</th><td>{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</td></tr>
-        <tr><th>月度唯一性</th><td>{'是' if getattr(schema, 'is_monthly_unique', False) else '否'}</td></tr>
-    </table>
-    <h2>字段详情</h2>
-    <table>
-        <thead>
-            <tr>
-                <th>字段名</th>
-                <th>数据类型</th>
-                <th>业务含义</th>
-                <th>可聚合</th>
-                <th>备注</th>
-            </tr>
-        </thead>
-        <tbody>
-"""
-        # 添加字段行
-        if hasattr(schema, 'fields'):
-            for field_name, field in schema.fields.items():
-                # 简化数据类型显示
-                field_type_str = str(field.field_type) if hasattr(field, 'field_type') else 'string'
-                field_type = field_type_str.replace('FieldType.', '').lower()
-                comment = field.comment if hasattr(field, 'comment') else ''
-                aggregatable = '是' if getattr(field, 'aggregatable', False) else '否'
-                remarks = ''  # 可以从其他地方获取备注
-                html_content += f"""
-            <tr>
-                <td><code>{field_name}</code></td>
-                <td>{field_type}</td>
-                <td>{comment}</td>
-                <td>{aggregatable}</td>
-                <td>{remarks}</td>
-            </tr>"""
-        html_content += """
-        </tbody>
-    </table>
-    <hr>
-    <p><em>本文档由Staran Schema自动生成</em></p>
-</body>
-</html>"""
-        return html_content
-    def _convert_md_to_pdf(self, md_path: str, pdf_path: str):
-        """将Markdown转换为PDF (需要安装pandoc或其他转换工具)"""
-        try:
-            import subprocess
-            # 尝试使用pandoc转换
-            subprocess.run([
-                'pandoc', md_path, '-o', pdf_path,
-                '--pdf-engine=xelatex',
-                '--variable=CJKmainfont:Microsoft YaHei'
-            ], check=True)
-        except (subprocess.CalledProcessError, FileNotFoundError):
-            # 如果pandoc不可用，创建一个说明文件
-            with open(pdf_path.replace('.pdf', '_conversion_note.txt'), 'w', encoding='utf-8') as f:
-                f.write(f"PDF转换说明：\\n")
-                f.write(f"原始Markdown文件：{md_path}\\n")
-                f.write(f"如需PDF格式，请安装pandoc工具：\\n")
-                f.write(f"pip install pandoc\\n")
-                f.write(f"或访问：https://pandoc.org/installing.html\\n")
-    def _get_markdown_header_template(self) -> str:
-        return "# {title}\\n\\n## 基本信息\\n\\n"
-    def _get_markdown_field_template(self) -> str:
-        return "| {name} | {type} | {comment} | {aggregatable} |\\n"
-    def _get_markdown_footer_template(self) -> str:
-        return "\\n---\\n*文档生成时间: {timestamp}*\\n"
-    def _get_html_header_template(self) -> str:
-        return "<h1>{title}</h1>\\n<h2>基本信息</h2>\\n"
-    def _get_html_field_template(self) -> str:
-        return "<tr><td>{name}</td><td>{type}</td><td>{comment}</td><td>{aggregatable}</td></tr>\\n"
-    def _get_html_footer_template(self) -> str:
-        return "<hr><p><em>文档生成时间: {timestamp}</em></p>\\n"
-def export_business_docs(business_domain: str, schemas_dict: Dict, output_dir: str = "./docs",
-                        format_type: str = "markdown") -> Dict[str, str]:
-    """
-    批量导出业务域表结构文档
-    Args:
-        business_domain: 业务域名称
-        schemas_dict: 表结构字典 {table_type: schema}
-        output_dir: 输出目录
-        format_type: 文档格式
-    Returns:
-        生成的文档文件路径字典
-    """
-    generator = SchemaDocumentGenerator()
-    results = {}
-    for table_type, schema in schemas_dict.items():
-        file_path = generator.export_schema_doc(
-            schema=schema,
-            business_domain=business_domain,
-            table_type=table_type,
-            output_dir=output_dir,
-            format_type=format_type
-        )
-        results[table_type] = file_path
-    return results
-__all__ = [
-    'SchemaDocumentGenerator',
-    'export_business_docs'
-]

staran 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

staran 0.6.0__py3-none-any.whl → 1.0.0__py3-none-any.whl