mdbq 2.2.2__py3-none-any.whl → 2.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
1
  # -*- coding: UTF-8 –*-
2
+ import re
3
+
2
4
  from mdbq.mongo import mongo
3
5
  from mdbq.mysql import mysql
4
6
  from mdbq.mysql import s_query
@@ -14,8 +16,6 @@ import getpass
14
16
  import json
15
17
  import os
16
18
 
17
- from sqlalchemy.event import remove
18
-
19
19
  """
20
20
  程序用于下载数据库(调用 s_query.py 下载并清洗), 并对数据进行聚合清洗, 不会更新数据库信息;
21
21
 
@@ -862,7 +862,47 @@ class GroupBy:
862
862
  }
863
863
  )
864
864
  df.insert(loc=1, column='推广渠道', value='万相台无界版') # df中插入新列
865
+ # 1. 匹配 L后面接 2 个或以上数字,不区分大小写,示例:L345
866
+ # 2. 其余情况,L 后面接多个数字的都会被第一条 if 命中,不区分大小写
867
+ df['消费力层级'] = df.apply(
868
+ lambda x:
869
+ ''.join(re.findall(r'(l\d+)', x['人群名字'].upper(), re.IGNORECASE)) if re.findall(r'(l\d{2,})', x['人群名字'], re.IGNORECASE)
870
+ else 'L5' if re.findall(r'(l\d*5)', x['人群名字'], re.IGNORECASE)
871
+ else 'L4' if re.findall(r'(l\d*4)', x['人群名字'], re.IGNORECASE)
872
+ else 'L3' if re.findall(r'(l\d*3)', x['人群名字'], re.IGNORECASE)
873
+ else 'L2' if re.findall(r'(l\d*2)', x['人群名字'], re.IGNORECASE)
874
+ else 'L1' if re.findall(r'(l\d*1)', x['人群名字'], re.IGNORECASE)
875
+ else '', axis=1)
876
+ # 1. 匹配连续的 4 个数字且后面不能接数字或"元"或汉字,筛掉的人群示例:月均消费6000元|受众20240729175213|xxx2024真皮公文包
877
+ # 2. 匹配 2数字_2数字且前面不能是数字,合法匹配:人群_30_50_促; 非法示例:L345_3040 避免识别出 35~20 岁用户的情况
878
+ # pattern = r'(\d{4})(?!\d|[\u4e00-\u9fa5])' # 匹配 4 个数字,后面不能接数字或汉字
879
+ # pattern = r'(?<![\d\u4e00-\u9fa5])(\d{4})' # 匹配前面不是数字或汉字的 4 个连续数字
880
+
881
+ # 匹配 4 个数字,前面和后面都不能是数字或汉字
882
+ pattern1 = r'(?<![\d\u4e00-\u9fa5])(\d{4})(?!\d|[\u4e00-\u9fa5])'
883
+ # 匹配指定字符,前面不能是数字或 l 或 L 开头
884
+ pattern2 = r'(?<![\dlL])(\d{2}_\d{2})'
885
+ df['用户年龄'] = df.apply(
886
+ lambda x:
887
+ ''.join(re.findall(pattern1, x['人群名字'].upper())) if re.findall(pattern1, x['人群名字'])
888
+ # else ''.join(re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'].upper())) if re.findall(r'[^\d|l|L](\d{2}_\d{2})', x['人群名字'])
889
+ else ''.join(re.findall(pattern2, x['人群名字'].upper())) if re.findall(pattern2, x['人群名字'])
890
+ else ''.join(re.findall(r'(\d{2}-\d{2})岁', x['人群名字'].upper())) if re.findall(r'(\d{2}-\d{2})岁', x['人群名字'])
891
+ else '', axis=1)
892
+ df['用户年龄'] = df['用户年龄'].apply(
893
+ lambda x: f'{x[:2]}~{x[2:4]}' if str(x).isdigit()
894
+ else str(x).replace('_', '~') if '_' in x
895
+ else str(x).replace('-', '~') if '-' in x
896
+ else x
897
+ )
898
+ # 年龄层不能是 0 开头
899
+ df['用户年龄'] = df['用户年龄'].apply(
900
+ lambda x: '' if str(x).startswith('0') else x)
901
+ # df = df.head(1000)
902
+ # df.to_csv('/Users/xigua/Downloads/test.csv', index=False, header=True, encoding='utf-8_sig')
903
+ # breakpoint()
865
904
  return df
905
+
866
906
  elif '天猫_关键词报表' in table_name:
867
907
  df.rename(columns={
868
908
  '场景名字': '营销场景',
@@ -1774,4 +1814,3 @@ if __name__ == '__main__':
1774
1814
  data_aggregation(service_databases=[{'company': 'mysql'}], months=1) # 正常的聚合所有数据
1775
1815
  # data_aggregation_one(service_databases=[{'company': 'mysql'}], months=1) # 单独聚合某一个数据库,具体库进函数编辑
1776
1816
  # optimize_data.op_data(service_databases=[{'company': 'mysql'}], days=3650) # 立即启动对聚合数据的清理工作
1777
-
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: mdbq
3
- Version: 2.2.2
3
+ Version: 2.2.4
4
4
  Home-page: https://pypi.org/project/mdbsql
5
5
  Author: xigua,
6
6
  Author-email: 2587125111@qq.com
@@ -5,7 +5,7 @@ mdbq/aggregation/aggregation.py,sha256=98pECXV6yw7XSjoLnJBgHIQWM2s2aaB8ii5qNebAI
5
5
  mdbq/aggregation/df_types.py,sha256=U9i3q2eRPTDY8qAPTw7irzu-Tlg4CIySW9uYro81wdk,8125
6
6
  mdbq/aggregation/mysql_types.py,sha256=DQYROALDiwjJzjhaJfIIdnsrNs11i5BORlj_v6bp67Y,11062
7
7
  mdbq/aggregation/optimize_data.py,sha256=Wis40oL04M7E1pkvgNPjyVFAUe-zgjimjIVAikxYY8Y,4418
8
- mdbq/aggregation/query_data.py,sha256=h8AHq0v8xvsGjqoCZkAH1ZXZ3n05Q-JETVR6WDWzCyg,82334
8
+ mdbq/aggregation/query_data.py,sha256=Za8shm_I9ESzfYUVPRPOh8kk3yVWwvWAvwV0mFVx5mI,85340
9
9
  mdbq/bdup/__init__.py,sha256=AkhsGk81SkG1c8FqDH5tRq-8MZmFobVbN60DTyukYTY,28
10
10
  mdbq/bdup/bdup.py,sha256=LAV0TgnQpc-LB-YuJthxb0U42_VkPidzQzAagan46lU,4234
11
11
  mdbq/clean/__init__.py,sha256=A1d6x3L27j4NtLgiFV5TANwEkLuaDfPHDQNrPBbNWtU,41
@@ -38,7 +38,7 @@ mdbq/pbix/pbix_refresh.py,sha256=JUjKW3bNEyoMVfVfo77UhguvS5AWkixvVhDbw4_MHco,239
38
38
  mdbq/pbix/refresh_all.py,sha256=viOlLCmz9zg61Q2nzjgl8dChfQxnxRd1A_jmQMb2oDM,5918
39
39
  mdbq/pbix/refresh_all_old.py,sha256=_pq3WSQ728GPtEG5pfsZI2uTJhU8D6ra-htIk1JXYzw,7192
40
40
  mdbq/spider/__init__.py,sha256=RBMFXGy_jd1HXZhngB2T2XTvJqki8P_Fr-pBcwijnew,18
41
- mdbq-2.2.2.dist-info/METADATA,sha256=-_fzEPy1JZTOKPkqXghaBcg7ERBf8HzN3G3KOuX6Ino,245
42
- mdbq-2.2.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
43
- mdbq-2.2.2.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
44
- mdbq-2.2.2.dist-info/RECORD,,
41
+ mdbq-2.2.4.dist-info/METADATA,sha256=KiryBvuQemT-aGcf786FzHmwJJ3eXp7tAzIdHH6I4BM,245
42
+ mdbq-2.2.4.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
43
+ mdbq-2.2.4.dist-info/top_level.txt,sha256=2FQ-uLnCSB-OwFiWntzmwosW3X2Xqsg0ewh1axsaylA,5
44
+ mdbq-2.2.4.dist-info/RECORD,,
File without changes