PyPI - mdbq - Versions diffs - 0.0.8__tar.gz → 0.0.9__tar.gz - Mend

mdbq 0.0.8tar.gz → 0.0.9tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

{mdbq-0.0.8 → mdbq-0.0.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mdbq
-Version: 0.0.8
+Version: 0.0.9
 Home-page: https://pypi.org/project/mdbsql
 Author: xigua,
 Author-email: 2587125111@qq.com

mdbq-0.0.9/mdbq/dataframe/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@
1	+
2	+
3	+
4	+ # dataframe 优化

mdbq-0.0.9/mdbq/dataframe/converter.py ADDED Viewed

@@ -0,0 +1,57 @@
+# -*- coding:utf-8 -*-
+import pandas as pd
+import numpy as np
+import re
+class DataFrameConverter(object):
+    """Clean a pandas DataFrame so it can be stored in MySQL / MongoDB.
+
+    The converter sanitises cell values, tries to give ``object`` columns a
+    numeric dtype, parses date/time columns and rewrites column names so they
+    contain no characters that are illegal in MySQL identifiers.
+    """
+    # Characters replaced by "_" in column names (MySQL rejects them).
+    # Compiled once here instead of once per column.
+    _ILLEGAL_COL_CHARS = re.compile(r'[\',，（）()/=<>+\-*^"’\[\]~#|&% .;]')
+    def __init__(self, df=None):
+        """Store an optional default DataFrame.
+
+        :param df: DataFrame used by ``convert_df_cols`` when it is called
+            without data. ``None`` (the default) means "empty DataFrame";
+            a fresh object is created per instance so no default is shared.
+        """
+        self.df = pd.DataFrame({}) if df is None else df
+    @staticmethod
+    def _percent_to_ratio(value):
+        """Turn a percentage string such as ``'12.5%'`` into ``0.125``.
+
+        Values that do not end with ``%``, that contain ``~`` (ranges such as
+        ``'10%~20%'``), or whose numeric part cannot be parsed are returned
+        unchanged instead of raising.
+        """
+        text = str(value)
+        if not text.endswith('%') or '~' in text:
+            return value
+        try:
+            return float(text.rstrip('%')) / 100
+        except ValueError:
+            # e.g. 'abc%' or a bare '%': not a number, keep the original cell
+            return value
+    def convert_df_cols(self, df=None):
+        """Sanitise values, dtypes and column names of a DataFrame.
+
+        NOTE: the DataFrame is modified in place and also returned.
+
+        :param df: DataFrame to clean. When omitted, ``None`` or empty, the
+            DataFrame given to the constructor is used instead.
+        :return: the cleaned DataFrame, or ``None`` when there is no data to
+            process (both ``df`` and ``self.df`` are empty).
+        """
+        if df is None or len(df) == 0:
+            df = self.df
+            if len(df) == 0:
+                return None
+        # Replace illegal numeric values.
+        df.replace([np.inf, -np.inf], 0, inplace=True)
+        # Replace placeholder strings that stand for "no value".
+        df.replace(to_replace=['\\N', '-', '--', '', 'nan'], value=0, regex=False, inplace=True)
+        # Strip thousands separators.
+        df.replace(to_replace=[','], value='', regex=True, inplace=True)
+        # '="' and '"' must be removed in two separate passes: a single
+        # character class would also strip the '=' from values like id=86785565.
+        df.replace(to_replace=['="'], value='', regex=True, inplace=True)
+        df.replace(to_replace=['"'], value='', regex=True, inplace=True)
+        for col in df.columns.tolist():
+            # Percentages are not portable across databases; store them as ratios.
+            df[col] = df[col].apply(self._percent_to_ratio)
+            # Try to give object columns a numeric dtype: int first, then float.
+            if df[col].dtype == 'object':
+                try:
+                    df[col] = df[col].astype(int)
+                except Exception:
+                    # Best effort: errors='ignore' leaves the column untouched
+                    # when it cannot be cast to float either.
+                    df[col] = df[col].astype('float64', errors='ignore')
+            # Keep at most 6 decimal places for float columns.
+            if df[col].dtype == 'float':
+                df[col] = df[col].apply(lambda x: round(float(x), 6) if x != 0 else x)
+            # Columns whose name contains "日期" (date) or "时间" (time) are parsed
+            # as datetimes; unparsable columns are deliberately left as they are.
+            if '日期' in col or '时间' in col:
+                try:
+                    df[col] = df[col].apply(lambda x: pd.to_datetime(x))
+                except Exception:
+                    pass
+            # Normalise the column name: lower-case, illegal characters -> "_".
+            new_col = self._ILLEGAL_COL_CHARS.sub('_', col.lower())
+            df.rename(columns={col: new_col}, inplace=True)
+        df.fillna(0, inplace=True)
+        return df
+if __name__ == '__main__':
+    # Manual smoke test: run the converter on a small random frame and show
+    # the dtype of column 'a' followed by the cleaned frame itself.
+    sample = pd.DataFrame(np.random.randn(5, 3), columns=list('abc'))
+    cleaned = DataFrameConverter().convert_df_cols(sample)
+    print(cleaned['a'].dtype)
+    print(cleaned)

{mdbq-0.0.8 → mdbq-0.0.9}/mdbq/mongo/mongo.py RENAMED Viewed

@@ -10,6 +10,7 @@ import pymongo
 from functools import wraps
 from concurrent.futures import ThreadPoolExecutor
 from mdbq.config import get_myconf
+from mdbq.dataframe import converter
 warnings.filterwarnings('ignore')
@@ -209,15 +210,6 @@ class DownMongo:
         else:
             print(f'{now}正在下载数据 ({self.host}) {self.db_name}: {self.collection_name}, 数据区间: {self.start_date} ~ {self.end_date}')
-        # self.start_date = datetime.datetime.now() - datetime.timedelta(days=self.days)
-        # query = {  # 读取数据库中指定时间区间的数据
-        #     '日期':
-        #         {
-        #             '$gte': self.start_date,
-        #             '$lte': self.end_date
-        #         },
-        # }
-        # _df = pd.DataFrame((list(_collection.find(query))))  # 将数据转换为 dataframe
         if not self.start_date:
             self.start_date = datetime.datetime.now() - datetime.timedelta(days=self.days)
             self.end_date = datetime.datetime.now()
@@ -236,17 +228,16 @@ class DownMongo:
             # print(doc)
             datas.append(doc)
         _df = pd.DataFrame(datas)
+        if len(_df) == 0:
+            print(f'查询的数据量: {len(_df)}, 森么都米有花生')
+            self.client.close()
+            return
+        if '_id' in _df.columns.tolist():
+            _df.drop('_id', axis=1, inplace=True)
-        for col in _df.columns.tolist():  # 保存之前尝试转换数据类型
-            if '日期' in col:
-                _df[col] = _df[col].astype(str).apply(lambda x: ''.join(re.findall(r'(\d{4}-\d{2}-\d{2})', x)))
-                _df[col] = pd.to_datetime(_df[col], format='%Y-%m-%d', errors='ignore')  # 转换日期列
-            elif '_id' in col:
-                # _df['_id'] = _df['_id'].astype(str)  # 将 '_id' 字段转换为字符串，因为它是一个 ObjectId 对象
-                _df.drop('_id', axis=1, inplace=True)
-            else:
-                _df[col] = pd.to_numeric(_df[col], errors='ignore')
         print(f'查询的数据量: {len(_df)}')
+        cv = converter.DataFrameConverter()
+        _df = cv.convert_df_cols(_df)
         s_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.min()))[0]
         e_date = re.findall(r'(\d{4}-\d{2}-\d{2})', str(_df['日期'].values.max()))[0]
         if not file_type.startswith('.'):
@@ -420,7 +411,8 @@ class UploadMongo:
         start_date = None
         end_date = None
-        df = self.convert_df_cols(df=df)  # 清理列名中的不合规字符
+        cv = converter.DataFrameConverter()
+        df = cv.convert_df_cols(df=df)  # 清理列名中的不合规字符
         if '日期' in df.columns.tolist():
             # df['日期'] = df['日期'].apply(lambda x: pd.to_datetime(x))
             collections.create_index([('日期', -1)], background=True)  # 必须, 创建索引, background 不阻塞
@@ -469,40 +461,6 @@ class UploadMongo:
         self.client.close()  #
-    def convert_df_cols(self, df):
-        """
-        清理 dataframe 列名的不合规字符(mysql)
-        对数据类型进行转换(尝试将 object 类型转为 int 或 float)
-        """
-        # dtypes = df.dtypes.apply(str).to_dict()  # 将 dataframe 数据类型转为字典形式
-        df.replace([np.inf, -np.inf], 0, inplace=True)  # 清理一些非法值
-        cols = df.columns.tolist()
-        df.replace(to_replace=['\\N', '-', '--', '', 'nan'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
-        df.replace(to_replace=[','], value='', regex=True, inplace=True)
-        for col in cols:
-            # 百分比在某些数据库中不兼容, 转换百分比为小数
-            df[col] = df[col].apply(lambda x: float(float((str(x).rstrip("%"))) / 100) if str(x).endswith('%') and '~' not in str(x) else x)
-            # 尝试转换合适的数据类型
-            if df[col].dtype == 'object':
-                try:
-                    df[col] = df[col].astype(int)  # 尝试转换 int
-                except:
-                    df[col] = df[col].astype('float64', errors='ignore')    # 尝试转换 float, 报错则忽略
-            if df[col].dtype == 'float':  # 对于小数类型, 保留 6 位小数
-                df[col] = df[col].apply(lambda x: round(float(x), 6) if x != 0 else x)
-            # 清理列名, 在 mysql 里面列名不能含有某些特殊字符
-            if '日期' in col or '时间' in col:
-                try:
-                    df[col] = df[col].apply(lambda x: pd.to_datetime(x))
-                except:
-                    pass
-            new_col = col.lower()
-            new_col = re.sub(r'[\',，（）()/=<>+\-*^"’\[\]~#|&% .;]', '_', new_col)
-            df.rename(columns={col: new_col}, inplace=True)
-        df.fillna(0, inplace=True)
-        return df
 class OptimizeDatas:
     """

{mdbq-0.0.8 → mdbq-0.0.9}/mdbq/mysql/mysql.py RENAMED Viewed

@@ -12,6 +12,7 @@ from sqlalchemy import create_engine
 import os
 import calendar
 from mdbq.config import get_myconf
+from mdbq.dataframe import converter
 warnings.filterwarnings('ignore')
@@ -67,7 +68,8 @@ class MysqlUpload:
         """
         db_name = re.sub(r'[\',，（）()/=<>+\-*^"’\[\]~#|&% .]', '_', db_name)
         tabel_name = re.sub(r'[\',，（）()/=<>+\-*^"’\[\]~#|&% .]', '_', tabel_name)
-        df = self.convert_df_cols(df=df)  # 清理列名中的不合规字符
+        cv = converter.DataFrameConverter()
+        df = cv.convert_df_cols(df=df)  # 清理列名中的不合规字符
         connection = pymysql.connect(**self.config)  # 连接数据库
         try:
@@ -182,42 +184,6 @@ class MysqlUpload:
         else:
             return 'mediumtext'
-    def convert_df_cols(self, df):
-        """
-        清理 dataframe 列名的不合规字符(mysql)
-        对数据类型进行转换(尝试将 object 类型转为 int 或 float)
-        """
-        # dtypes = df.dtypes.apply(str).to_dict()  # 将 dataframe 数据类型转为字典形式
-        df.replace([np.inf, -np.inf], 0, inplace=True)  # 清理一些非法值
-        cols = df.columns.tolist()
-        df.replace(to_replace=['\\N', '-', '--', '', 'nan'], value=0, regex=False, inplace=True)  # 替换掉特殊字符
-        df.replace(to_replace=[','], value='', regex=True, inplace=True)
-        for col in cols:
-            # 百分比在某些数据库中不兼容, 转换百分比为小数
-            df[col] = df[col].apply(lambda x: float(float((str(x).rstrip("%"))) / 100) if str(x).endswith('%') and '~' not in str(x) else x)
-            # 尝试转换合适的数据类型
-            if df[col].dtype == 'object':
-                try:
-                    df[col] = df[col].astype(int)  # 尝试转换 int
-                except:
-                    df[col] = df[col].astype('float64', errors='ignore')    # 尝试转换 float, 报错则忽略
-            if df[col].dtype == 'float':  # 对于小数类型, 保留 6 位小数
-                df[col] = df[col].apply(lambda x: round(float(x), 6) if x != 0 else x)
-            # 清理列名, 在 mysql 里面列名不能含有某些特殊字符
-            if '日期' in col or '时间' in col:
-                try:
-                    df[col] = df[col].apply(lambda x: pd.to_datetime(x))
-                except:
-                    pass
-            new_col = col.lower()
-            new_col = re.sub(r'[\',，（）()/=<>+\-*^"’\[\]~#|&% .;]', '_', new_col)
-            df.rename(columns={col: new_col}, inplace=True)
-        df.fillna(0, inplace=True)
-        return df
     def upload_pandas(self, update_path, db_name, days=None):
         """
         专门用来上传 pandas数据源的全部文件,  跳过 '其他数据'  or '京东数据集'

{mdbq-0.0.8 → mdbq-0.0.9}/mdbq.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mdbq
-Version: 0.0.8
+Version: 0.0.9
 Home-page: https://pypi.org/project/mdbsql
 Author: xigua,
 Author-email: 2587125111@qq.com

{mdbq-0.0.8 → mdbq-0.0.9}/mdbq.egg-info/SOURCES.txt RENAMED Viewed

@@ -6,9 +6,6 @@ mdbq.egg-info/PKG-INFO
 mdbq.egg-info/SOURCES.txt
 mdbq.egg-info/dependency_links.txt
 mdbq.egg-info/top_level.txt
-mdbq/aggregation/__init__.py
-mdbq/aggregation/aggregation.py
-mdbq/aggregation/query_data.py
 mdbq/bdup/__init__.py
 mdbq/bdup/bdup.py
 mdbq/clean/__init__.py
@@ -18,6 +15,8 @@ mdbq/company/copysh.py
 mdbq/config/__init__.py
 mdbq/config/get_myconf.py
 mdbq/config/update_conf.py
+mdbq/dataframe/__init__.py
+mdbq/dataframe/converter.py
 mdbq/log/__init__.py
 mdbq/log/mylogger.py
 mdbq/mongo/__init__.py

{mdbq-0.0.8 → mdbq-0.0.9}/setup.py RENAMED Viewed

@@ -3,7 +3,7 @@
 from setuptools import setup, find_packages
 setup(name='mdbq',
-      version='0.0.8',
+      version='0.0.9',
       author='xigua, ',
       author_email="2587125111@qq.com",
       url='https://pypi.org/project/mdbsql',

mdbq-0.0.8/mdbq/aggregation/__init__.py DELETED Viewed

	@@ -1,4 +0,0 @@
1	-
2	-
3	-
4	- # 数据清洗, 数据聚合, 入库

mdbq 0.0.8__tar.gz → 0.0.9__tar.gz

mdbq 0.0.8tar.gz → 0.0.9tar.gz