servify 0.0.43__tar.gz → 0.0.45__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {servify-0.0.43/servify.egg-info → servify-0.0.45}/PKG-INFO +1 -1
- {servify-0.0.43 → servify-0.0.45}/servify/__init__.py +1 -1
- servify-0.0.45/servify/commons/__init__.py +9 -0
- servify-0.0.45/servify/commons/func_read/utils/csv_reader.py +91 -0
- servify-0.0.45/servify/commons/func_read/utils/encoding.py +33 -0
- servify-0.0.45/servify/commons/func_read/utils/excel.py +208 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/helpers.py +4 -4
- servify-0.0.45/servify/commons/func_read/utils/json_reader.py +50 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/parquet_reader.py +3 -1
- servify-0.0.45/servify/commons/func_read/utils/paths.py +110 -0
- {servify-0.0.43 → servify-0.0.45/servify.egg-info}/PKG-INFO +1 -1
- {servify-0.0.43 → servify-0.0.45}/setup.py +1 -1
- servify-0.0.43/servify/commons/func_read/utils/csv_reader.py +0 -23
- servify-0.0.43/servify/commons/func_read/utils/encoding.py +0 -9
- servify-0.0.43/servify/commons/func_read/utils/excel.py +0 -15
- servify-0.0.43/servify/commons/func_read/utils/json_reader.py +0 -10
- servify-0.0.43/servify/commons/func_read/utils/paths.py +0 -14
- servify-0.0.43/servify/tests/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/LICENCE +0 -0
- {servify-0.0.43 → servify-0.0.45}/README.md +0 -0
- {servify-0.0.43 → servify-0.0.45}/pyproject.toml +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/base.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/errors.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/read.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/base.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/schema.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/apply_schema.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/base.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/errors.py +0 -0
- {servify-0.0.43/servify/commons → servify-0.0.45/servify/settings}/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/base.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/handlers.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/levels.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/logger.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/persist.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/protocols.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/log/servify_configs.py +0 -0
- {servify-0.0.43/servify/settings → servify-0.0.45/servify/settings/path}/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/path/paths.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/dbutils_helper.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/environment.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/spark_session.py +0 -0
- {servify-0.0.43/servify/settings/path → servify-0.0.45/servify/tests}/__init__.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/tests/conftest.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify/tests/test_commons.py +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify.egg-info/SOURCES.txt +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify.egg-info/dependency_links.txt +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify.egg-info/requires.txt +0 -0
- {servify-0.0.43 → servify-0.0.45}/servify.egg-info/top_level.txt +0 -0
- {servify-0.0.43 → servify-0.0.45}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: servify
|
|
3
|
-
Version: 0.0.43
|
|
3
|
+
Version: 0.0.45
|
|
4
4
|
Summary: A Python library that simplifies data manipulation and workflow development with PySpark in Databricks environments.
|
|
5
5
|
Author: Felipe Pegoraro
|
|
6
6
|
Author-email: felipepegoraro93@gmail.com
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import re
|
|
3
|
+
from pyspark.sql import DataFrame
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CsvReader:
    """Read delimited text files into Spark DataFrames.

    Args:
        spark: Spark session whose ``read`` attribute builds the CSV reader.
        log: Logger-like object exposing ``debug``, ``info``, ``warning`` and
            ``error``.
    """

    # Candidate delimiters; on a tie the earliest entry wins.
    _DELIMITERS = (",", ";", "\t", "|")
    _DEFAULT_DELIMITER = ","
    _DEFAULT_ENCODING = "utf-8"

    def __init__(self, spark, log):
        self.spark = spark
        self.log = log

    def _resolve_encoding(self, file_path: str) -> str:
        """Return the encoding used to open *file_path* for sampling.

        ``detect_delimiter`` used to call ``self.obter_enconding``
        unconditionally, but this class does not define that method, so the
        call raised ``AttributeError``. The hook is still honoured when a
        subclass or mixin supplies it; otherwise UTF-8 is assumed.
        """
        detector = getattr(self, "obter_enconding", None)
        if callable(detector):
            return detector(file_path)

        self.log.debug(
            f"No encoding detector available, assuming "
            f"'{self._DEFAULT_ENCODING}' for file: {file_path}"
        )
        return self._DEFAULT_ENCODING

    def detect_delimiter(self, file_path: str) -> str:
        """Guess the column delimiter from the first line of a local file.

        The delimiter occurring most often in the first line wins. If none of
        the candidates occurs, ``csv.Sniffer`` is tried and, failing that,
        ``","`` is returned.

        Args:
            file_path: Path of a file readable with the builtin ``open``.

        Returns:
            The detected delimiter, or ``","`` when nothing can be detected
            (including for an empty file).

        Raises:
            ValueError: If the file cannot be opened or decoded.
        """
        encoding = self._resolve_encoding(file_path)

        self.log.info(f"Starting delimiter detection for file: {file_path}")

        try:
            with open(file_path, "r", encoding=encoding, newline="") as f:
                first_line = f.readline()
        except Exception as e:
            self.log.error(f"Error reading file {file_path}: {e}", exc_info=True)
            raise ValueError(f"Error reading file {file_path}: {e}") from e

        if not first_line:
            # Nothing to analyse: skip the counting/sniffing that could only fail.
            self.log.warning(f"File is empty: {file_path}.")
            self.log.warning(f"Using default delimiter ',' for file: {file_path}.")
            return self._DEFAULT_DELIMITER

        # Computed outside the f-string: reusing double quotes inside a
        # replacement field is only valid syntax from Python 3.12 onwards.
        printable_line = first_line.rstrip("\n")
        self.log.debug(f"first line read for delimiter detection: {printable_line}")

        counts = {d: first_line.count(d) for d in self._DELIMITERS}
        self.log.debug(f"Delimiter counts: {counts}")

        if not any(counts.values()):
            self.log.warning(
                f"No delimiters found in the first line of file: {file_path}. Trying csv.Sniffer...."
            )
            try:
                dialect = csv.Sniffer().sniff(first_line, delimiters=",;|")
                detected = dialect.delimiter
                self.log.info(f"Delimiter detected by csv.Sniffer: {detected}")
                return detected
            except Exception as e:
                self.log.error(
                    f"csv.Sniffer failed to detect delimiter for file {file_path}: {e}",
                    exc_info=True,
                )
                self.log.warning(
                    f"Using default delimiter ',' for file: {file_path}."
                )
                return self._DEFAULT_DELIMITER

        # ``max`` returns the first maximal key, i.e. the _DELIMITERS priority.
        detected = max(counts, key=counts.get)
        self.log.info(f"Delimiter detected: {detected} for file: {file_path}")
        return detected

    def read_csv(self, path: str, delimiter: str | None = None, encoding: str | None = None) -> DataFrame:
        """Read a headered CSV file with every column left untyped.

        Args:
            path: Location handed to Spark's CSV reader.
            delimiter: Column separator; ``","`` when omitted or empty.
            encoding: File encoding; ``"utf-8"`` when omitted or empty.

        Returns:
            The DataFrame produced by ``spark.read...csv(path)``.
        """
        # Each option falls back on its own. Previously, supplying only one of
        # them forwarded ``None`` for the other to ``.option(...)``.
        effective_delimiter = delimiter or self._DEFAULT_DELIMITER
        effective_encoding = encoding or self._DEFAULT_ENCODING

        if delimiter and encoding:
            self.log.info(
                f"Using provided delimiter: '{delimiter}' and encoding: '{encoding}' for file: {path}."
            )
        else:
            if not delimiter and not encoding:
                # No detection is performed here, so the log must not claim it.
                self.log.warning(
                    f"No delimiter or encoding provided for file: {path}. Falling back to defaults."
                )
            self.log.info(
                f"Using delimiter: '{effective_delimiter}' and encoding: '{effective_encoding}' "
                f"for file: {path} (defaults applied where not provided)."
            )

        return (
            self.spark.read
            .option("header", True)
            .option("inferSchema", "false")
            .option("samplingRatio", 0.1)
            .option("delimiter", effective_delimiter)
            .option("encoding", effective_encoding)
            .csv(path)
        )
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import chardet
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class EncodingUtils:
    """Detect the text encoding of local files with ``chardet``.

    Args:
        log: Optional logger-like object. The class is also instantiated
            without arguments, so a module logger is used when omitted;
            before this, every ``self.log`` access raised ``AttributeError``.
    """

    def __init__(self, log=None):
        if log is None:
            # Imported locally so the module's import block stays untouched.
            import logging

            log = logging.getLogger(__name__)
        self.log = log

    def detect_encoding(self, file_path: str, *, sample_bytes: int = 4096) -> str:
        """Return the likely encoding of *file_path*.

        Args:
            file_path: Path of a file readable with the builtin ``open``.
            sample_bytes: Number of leading bytes handed to ``chardet``.

        Returns:
            The encoding name reported by ``chardet``, or ``"utf-8"`` when
            nothing is reported. A pure-ASCII sample also yields ``"utf-8"``.

        Raises:
            ValueError: If the file cannot be read or detection fails.
        """
        self.log.info(f"Initialized encoding detection for file: {file_path}")

        try:
            with open(file_path, "rb") as f:
                rawdata = f.read(sample_bytes)

            result = chardet.detect(rawdata) or {}
            encoding_detectado: str = result.get("encoding") or "utf-8"

            conf = result.get("confidence")
            self.log.debug(
                f"Encoding detected: {encoding_detectado} with confidence: {conf} and language: {result.get('language')}"
            )
        except Exception as e:
            self.log.error(
                f"Error detecting encoding for file {file_path}: {e}",
                exc_info=True,
            )
            raise ValueError(
                f"Error detecting encoding for file {file_path}: {e}"
            ) from e

        # Only the first ``sample_bytes`` were inspected. An ASCII-only sample
        # says nothing about the rest of the file, and UTF-8 decodes every
        # ASCII byte identically, so the wider encoding is the safer answer.
        if encoding_detectado.lower() == "ascii":
            encoding_detectado = "utf-8"

        self.log.info(
            f"Encoding detection completed for file: {file_path} - Encoding: {encoding_detectado}"
        )

        return encoding_detectado
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import os
|
|
3
|
+
import math
|
|
4
|
+
import re
|
|
5
|
+
from pyspark.sql import DataFrame
|
|
6
|
+
from pyspark.sql import types as T, functions as F
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExcelReader:
    """Load ``.xlsx`` workbooks through pandas and expose them as Spark DataFrames.

    Args:
        spark: Spark session used for ``createDataFrame``.
        log: Logger-like object exposing ``debug``, ``info``, ``warning`` and
            ``error``.
        commons_shared: Optional collaborator providing
            ``aplicar_schema_df(sdf, schema)``. It is required only when a
            schema is passed to the read methods. It may also be assigned to
            the ``commons_shared`` attribute after construction.
    """

    def __init__(self, spark, log, commons_shared=None):
        self.spark = spark
        self.log = log
        # The schema path reads ``self.commons_shared``, which was never set.
        self.commons_shared = commons_shared

    @staticmethod
    def _is_blank_cell(x) -> bool:
        """Tell whether a header cell carries no usable name."""
        if x is None:
            return True
        if isinstance(x, float) and math.isnan(x):
            return True
        s = str(x).strip()
        return s == "" or s.lower() == "nan"

    def saniteze_columns(
        self, header_cells: List, prefer_from_schema: Optional[T.StructType] = None
    ) -> List[str]:
        """Turn raw header cells into unique, identifier-like column names.

        Whitespace runs become ``_``; characters outside ``[0-9a-zA-Z_]`` are
        dropped; names starting with a digit get a ``c_`` prefix; repeated
        names get a ``__<k>`` suffix.

        Args:
            header_cells: Raw header values, one per column.
            prefer_from_schema: When given, its field names (by position)
                replace blank header cells.

        Returns:
            One sanitized name per entry of ``header_cells``, in order.
        """
        target_names: List[str] = (
            [f.name for f in prefer_from_schema] if prefer_from_schema else []
        )

        seen = set()
        safe_cols = []

        for i, h in enumerate(header_cells):
            positional_name = f"c_{i+1}"

            if self._is_blank_cell(h):
                base = target_names[i] if i < len(target_names) else positional_name
            else:
                base = str(h).strip()

            base = re.sub(r"\s+", "_", base)
            base = re.sub(r"[^0-9a-zA-Z_]", "", base)

            if not base:
                # A header made only of stripped characters (e.g. "###") used
                # to crash on ``base[0]``; fall back to the positional name.
                base = positional_name

            if base[0].isdigit():
                base = f"c_{base}"

            name = base
            k = 1
            while name in seen:
                name = f"{base}__{k}"
                k += 1
            seen.add(name)
            safe_cols.append(name)

        return safe_cols

    def concat_ps_dfs(
        self, lista_files: List[str], schema: Optional[T.StructType] = None
    ) -> DataFrame:
        """Read every workbook in ``lista_files`` and union the results by name.

        Args:
            lista_files: Paths of the ``.xlsx`` files to read.
            schema: Optional schema forwarded to ``read_xlsx_with_pandas``.

        Returns:
            A single Spark DataFrame. Columns missing from some files are
            filled by ``unionByName(..., allowMissingColumns=True)``.

        Raises:
            ValueError: If ``lista_files`` is empty.
        """
        if not lista_files:
            self.log.warning("No DataFrames provided for concatenation.")
            raise ValueError("No DataFrames provided for concatenation.")

        if len(lista_files) == 1:
            self.log.info("Only one DataFrame provided, returning it directly.")
            return self.read_xlsx_with_pandas(lista_files[0], schema=schema)

        self.log.info(f"Concatenating {len(lista_files)} Pandas on Spark DataFrames")

        df_final = self.read_xlsx_with_pandas(lista_files[0], schema=schema)
        self.log.info(f"file: {lista_files[0]} read successfully.")

        for file in lista_files[1:]:
            df = self.read_xlsx_with_pandas(file, schema=schema)
            df_final = df_final.unionByName(df, allowMissingColumns=True)
            self.log.info(f"file: {file} read and concatenated successfully.")

        self.log.info("DataFrames concatenated successfully.")
        return df_final

    def remove_header_rows(self, spark_df: DataFrame) -> DataFrame:
        """Drop rows whose first-column value repeats the first column's name.

        Both sides are reduced to lower-case ASCII letters and digits, the
        same reduction ``saniteze_columns`` applies to names. A header cell
        such as ``"First Name"`` therefore matches its column ``First_Name``;
        the previous normalization kept the space and never matched.

        Args:
            spark_df: DataFrame to clean.

        Returns:
            ``spark_df`` without the matching rows, or ``spark_df`` itself
            when there is nothing to remove.
        """
        self.log.info("Verify if having heands on lines to remove")

        if not spark_df.columns:
            self.log.info("No header rows detected, returning original DataFrame")
            return spark_df

        first_column = spark_df.columns[0]
        header_key = re.sub(r"[^0-9a-zA-Z]", "", first_column).lower()

        if not header_key:
            # An empty key would match every value without letters or digits.
            self.log.info("No header rows detected, returning original DataFrame")
            return spark_df

        value_key = F.lower(
            F.regexp_replace(
                F.col(first_column).cast("string"), r"[^0-9a-zA-Z]", ""
            )
        )
        # Null-safe equality never yields NULL, so negating it keeps rows
        # whose first column is NULL. The previous ``!=`` filter evaluated to
        # NULL for them and silently dropped them.
        is_header = value_key.eqNullSafe(F.lit(header_key))

        # ``limit(1)`` answers "is there any?" without counting every match.
        if spark_df.filter(is_header).limit(1).count() == 0:
            self.log.info("No header rows detected, returning original DataFrame")
            return spark_df

        self.log.info("Header rows detected, removing them")
        cleaned = spark_df.filter(~is_header)
        self.log.info("Header rows removed successfully")
        return cleaned

    def _build_string_frame(self, df, xlsx_path: str) -> DataFrame:
        """Create an all-string Spark DataFrame from *df*, tagged with its source file."""
        string_schema = T.StructType(
            [T.StructField(c, T.StringType(), True) for c in df.columns]
        )

        try:
            sdf = self.spark.createDataFrame(df, schema=string_schema)
        except Exception as e:
            self.log.error(
                "Spark Serverless failed to create DataFrame even in safe mode."
            )
            self.log.debug(f"Internal Spark error: {e}")
            raise

        return sdf.withColumn("source_file", F.lit(os.path.basename(xlsx_path)))

    def read_xlsx_with_pandas(
        self, xlsx_path: str, schema: Optional[T.StructType] = None, sheet_name: int = 0
    ) -> DataFrame:
        """Read one worksheet with pandas and return it as a Spark DataFrame.

        Every cell is converted to a string (missing cells become NULL) and a
        ``source_file`` column holding the workbook's base name is added.
        When ``schema`` is given it is applied through
        ``commons_shared.aplicar_schema_df``.

        Args:
            xlsx_path: Path of the workbook.
            schema: Optional target schema.
            sheet_name: Worksheet index passed to ``pandas.read_excel``.

        Returns:
            The resulting Spark DataFrame.

        Raises:
            AttributeError: If ``schema`` is given but ``commons_shared`` is
                not set.
            ValueError: If the worksheet contains no non-empty row.
        """
        if schema is not None and getattr(self, "commons_shared", None) is None:
            # Fail before any I/O instead of after the whole file was read.
            message = (
                "ExcelReader.commons_shared is not set; it is required to apply a schema."
            )
            self.log.error(message)
            raise AttributeError(message)

        self.log.info(f"Reading .xlsx file: {xlsx_path} by pandas")

        try:
            df = pd.read_excel(
                xlsx_path,
                engine="openpyxl",
                header=0,
                sheet_name=sheet_name,
            )
        except Exception as e:
            self.log.warning(f"Error reading .xlsx file {xlsx_path} by pandas: {e}")
            raise

        valid_mask = df.notna().any(axis=1)
        # ``any`` rather than ``all``: the file is unusable only when no row
        # holds data. The previous check rejected a file over one blank row.
        if not bool(valid_mask.any()):
            self.log.warning(f"All rows are empty in .xlsx file: {xlsx_path}")
            raise ValueError(f"All rows are empty in .xlsx file: {xlsx_path}")

        first_valid_pos = int(np.argmax(valid_mask.to_numpy()))

        # NOTE(review): pandas already consumed the sheet's first row as the
        # header (header=0), yet column names are derived from the first
        # non-empty *data* row, which ``remove_header_rows`` later drops.
        # Presumably the sheets carry a title row above the real header.
        # Confirm against real input files.
        header_raw = df.iloc[first_valid_pos].tolist()
        safe_cols = self.saniteze_columns(header_raw, prefer_from_schema=schema)

        df = df.dropna(how="all").reset_index(drop=True)

        # Stringify per value. ``where(mask, None)`` may leave NaN in numeric
        # columns, which ``str`` then turned into the literal text "nan".
        for col in df.columns:
            df[col] = df[col].map(lambda x: None if pd.isna(x) else str(x))

        df.columns = safe_cols

        self.log.info(
            "Using safe conversion mode to avoid internal Serverless Arrow errors."
        )
        sdf = self._build_string_frame(df, xlsx_path)

        if schema is not None:
            sdf = self.commons_shared.aplicar_schema_df(sdf, schema)

        return self.remove_header_rows(sdf)
|
|
208
|
+
|
|
@@ -22,10 +22,10 @@ class helper_reading_data(HelperBase):
|
|
|
22
22
|
# composição
|
|
23
23
|
self.paths = PathUtils()
|
|
24
24
|
self.encoding = EncodingUtils()
|
|
25
|
-
self.excel = ExcelReader(self.spark)
|
|
26
|
-
self.csv = CsvReader(self.spark)
|
|
27
|
-
self.json = JsonReader(self.spark)
|
|
28
|
-
self.parquet = ParquetReader(self.spark)
|
|
25
|
+
self.excel = ExcelReader(self.spark, self.log)
|
|
26
|
+
self.csv = CsvReader(self.spark, self.log)
|
|
27
|
+
self.json = JsonReader(self.spark, self.log)
|
|
28
|
+
self.parquet = ParquetReader(self.spark, self.log)
|
|
29
29
|
self.schema_utils = SchemaUtils(self.commons_shared)
|
|
30
30
|
|
|
31
31
|
# métodos de fachada, mantendo a API que o read.py espera
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from pyspark.sql import DataFrame
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class JsonReader:
    """Read JSON files into Spark DataFrames, choosing the ``multiline`` mode.

    Args:
        spark: Spark session whose ``read`` attribute builds the JSON reader.
        log: Logger-like object exposing ``debug``, ``info``, ``warning`` and
            ``error``.
    """

    def __init__(self, spark, log):
        self.spark = spark
        self.log = log

    def _resolve_encoding(self, path: str) -> str:
        """Return the encoding used to open *path* for sampling.

        The original code called ``self.obter_enconding`` although this class
        does not define it. The hook is honoured when a subclass or mixin
        provides it; otherwise UTF-8 is assumed.
        """
        detector = getattr(self, "obter_enconding", None)
        if callable(detector):
            return detector(path)
        self.log.debug(f"No encoding detector available, assuming 'utf-8' for file: {path}")
        return "utf-8"

    def _resolve_sample_file(self, path: str) -> str:
        """Return one concrete local file to inspect for *path*.

        Uses ``self.resolve_latest_file`` when a subclass or mixin provides
        it (the original called it unconditionally, but this class does not
        define it). Otherwise strips a leading ``file:`` and, for wildcard
        paths, picks the most recently modified match.
        """
        resolver = getattr(self, "resolve_latest_file", None)
        if callable(resolver):
            return resolver(path)

        # Imported locally so the module's import block stays untouched.
        import glob
        import os

        local_path = path[len("file:"):] if path.startswith("file:") else path
        if "*" not in local_path:
            return local_path

        matches = glob.glob(local_path)
        if not matches:
            self.log.error(f"No files found for path with wildcard: {local_path}")
            raise FileNotFoundError(f"No files found for path: {local_path}")
        return max(matches, key=os.path.getmtime)

    def detectar_json_multiline(self, path: str) -> bool:
        """Tell whether the JSON file at *path* must be read in multiline mode.

        The first non-blank line is parsed on its own. If it is a complete
        JSON value, the file is line-delimited (JSON Lines, or a whole
        document on one line) and ``False`` is returned. Otherwise the
        document spans several lines and ``True`` is returned.

        The previous check returned ``True`` for any file starting with ``{``
        or ``[``. That includes JSON Lines files, which Spark's multiline
        mode parses as one document instead of one record per line.

        Args:
            path: Path of a file readable with the builtin ``open``.

        Returns:
            ``True`` for multi-line documents, ``False`` otherwise (including
            for an empty file).

        Raises:
            ValueError: If the file cannot be opened or decoded.
        """
        # Imported locally so the module's import block stays untouched.
        import json

        self.log.info(f"Starting JSON multiline detection for file: {path}")

        encoding = self._resolve_encoding(path)

        first_line = ""
        try:
            with open(path, "r", encoding=encoding) as f:
                # Stream instead of ``readlines()``: only one line is needed.
                for raw_line in f:
                    candidate = raw_line.lstrip("\ufeff").strip()
                    if candidate:
                        first_line = candidate
                        break
            self.log.info(f"File {path} read successfully.")
        except Exception as e:
            self.log.error(f"Error reading file {path}: {e}")
            raise ValueError(f"Error reading file {path}: {e}") from e

        if not first_line:
            # Previously an IndexError on ``linhas[0]``.
            self.log.warning(f"File is empty: {path}. Assuming single line JSON.")
            return False

        self.log.debug(f"First line for JSON multiline detection: {first_line}")

        try:
            json.loads(first_line)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            self.log.info(f"JSON multiline detected for file: {path}")
            return True

        self.log.info(f"JSON single line detected for file: {path}")
        return False

    def read_json(self, path: str) -> DataFrame:
        """Read the JSON data at *path* with an auto-detected ``multiline`` option.

        Detection runs on a single concrete file resolved from *path*; Spark
        then reads the original *path*, wildcards included.

        Args:
            path: Location handed to Spark's JSON reader.

        Returns:
            The DataFrame produced by ``spark.read...json(path)``.
        """
        sample_file = self._resolve_sample_file(path)
        multiline = self.detectar_json_multiline(sample_file)

        # Detection always yields a bool, so the old ``is None`` fallback
        # branch was unreachable and has been removed.
        return self.spark.read.option("multiline", str(multiline).lower()).json(path)
|
|
@@ -3,8 +3,10 @@ from pyspark.sql import DataFrame
|
|
|
3
3
|
|
|
4
4
|
class ParquetReader:
    """Load Parquet data through a Spark session, logging each read."""

    def __init__(self, spark, log):
        # Spark session and logger-like object supplied by the caller.
        self.spark, self.log = spark, log

    def read_parquet(self, path: str) -> DataFrame:
        """Log the read, then return ``spark.read.parquet(path)``."""
        self.log.info(f"Reading Parquet file: {path}")
        reader = self.spark.read
        return reader.parquet(path)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import os
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PathUtils:
    """Helpers for locating input files on the local filesystem and on DBFS.

    Args:
        log: Optional logger-like object. The class is also instantiated
            without arguments, so a module logger is used when omitted;
            before this, every ``self.log`` access raised ``AttributeError``.
    """

    _FILE_SCHEME = "file:"

    def __init__(self, log=None):
        if log is None:
            # Imported locally so the module's import block stays untouched.
            import logging

            log = logging.getLogger(__name__)
        self.log = log

    @classmethod
    def _strip_file_scheme(cls, path: str) -> str:
        """Remove a leading ``file:`` from *path*.

        Only the prefix is removed; ``str.replace`` would also rewrite any
        later occurrence of ``file:`` inside the path.
        """
        if path.startswith(cls._FILE_SCHEME):
            return path[len(cls._FILE_SCHEME):]
        return path

    def list_xlsx_paths(self, directory: str) -> List[str]:
        """List the ``.xlsx`` files designated by *directory*.

        Args:
            directory: Either a directory or a single ``.xlsx`` file.

        Returns:
            ``[directory]`` when it is an ``.xlsx`` file; otherwise the sorted
            paths of the ``.xlsx`` entries directly inside the directory.

        Raises:
            ValueError: If the path is a non-``.xlsx`` file, or is neither a
                file nor a directory.
            FileNotFoundError: If the directory holds no ``.xlsx`` entry.
        """
        self.log.info(f"Listing .xlsx files in directory: {directory}")

        if os.path.isfile(directory):
            if directory.lower().endswith(".xlsx"):
                self.log.info(f"Single .xlsx file found: {directory}")
                return [directory]

            self.log.error(f"The specified path is a file but not .xlsx: {directory}")
            raise ValueError(f"The specified path is a file but not .xlsx: {directory}")

        if os.path.isdir(directory):
            # Sorted: ``os.listdir`` order is arbitrary, and callers
            # concatenate the files in the order returned here.
            paths = sorted(
                os.path.join(directory, nome)
                for nome in os.listdir(directory)
                if nome.lower().endswith(".xlsx")
            )
            if not paths:
                self.log.error(f"No .xlsx files found in directory: {directory}")
                raise FileNotFoundError(
                    f"No .xlsx files found in directory: {directory}"
                )

            self.log.info(f"Found {len(paths)} .xlsx files in directory: {directory}")
            return paths

        self.log.error(
            f"The specified path is neither a file nor a directory: {directory}"
        )
        raise ValueError(
            f"The specified path is neither a file nor a directory: {directory}"
        )

    def _first_listable(self, path: str, dbutils) -> str:
        """Return *path* or its ``file:`` form, whichever ``dbutils.fs.ls`` accepts first.

        Raises:
            FileNotFoundError: If no candidate can be listed.
        """
        candidates = [path]
        if not path.startswith(self._FILE_SCHEME):
            # Avoid building "file:file:..." for already-prefixed paths.
            candidates.append(f"{self._FILE_SCHEME}{path}")

        last_error = None
        for candidate in candidates:
            try:
                dbutils.fs.ls(candidate)
                return candidate
            except Exception as exc:  # dbutils error types are not visible here
                last_error = exc

        raise FileNotFoundError(
            f"File '{candidates[-1]}' is not accessible by DBFS netheir 'file:'."
        ) from last_error

    def resolve_accessible_path(self, path: str, dbutils) -> str:
        """Validate *path* for reading and return the form that is accessible.

        - With a wildcard: at least one local match must exist and be
          listable; the wildcard path itself is returned unchanged.
        - Without a wildcard: *path* is tried as given, then with a ``file:``
          prefix.

        Args:
            path: Path to validate, optionally containing ``*``.
            dbutils: Object exposing ``fs.ls(path)``.

        Returns:
            The path to hand to the reader.

        Raises:
            FileNotFoundError: If nothing matches or nothing is accessible.
        """
        if "*" in path:
            arquivos = glob.glob(self._strip_file_scheme(path))
            if not arquivos:
                raise FileNotFoundError(f"No file founded in: {path}")

            # Probing the first match is enough to validate access.
            self._first_listable(arquivos[0], dbutils)
            return path

        return self._first_listable(path, dbutils)

    def resolve_latest_file(self, path: str) -> str:
        """Resolve *path* to one concrete local file.

        Args:
            path: Local path, optionally prefixed by ``file:`` and optionally
                containing ``*``.

        Returns:
            The path without its ``file:`` prefix; for wildcard paths, the
            most recently modified match.

        Raises:
            ValueError: If *path* is not a string.
            FileNotFoundError: If a wildcard path matches nothing.
        """
        self.log.debug(f"Resolving path: {path}")

        if not isinstance(path, str):
            message = f"Error resolving path: expected str, got {type(path).__name__}"
            self.log.error(message)
            raise ValueError(message)

        path_resolvido = self._strip_file_scheme(path)

        if "*" in path_resolvido:
            arquivos = glob.glob(path_resolvido)
            self.log.debug(f"Found files with wildcard: {arquivos}")

            if not arquivos:
                self.log.error(
                    f"No files found for path with wildcard: {path_resolvido}"
                )
                raise FileNotFoundError(f"No files found for path: {path_resolvido}")

            escolhido = max(arquivos, key=os.path.getmtime)
            self.log.info(f"Latest file selected: {escolhido}")
            return escolhido

        self.log.info(f"Path resolved without wildcard: {path_resolvido}")
        return path_resolvido
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: servify
|
|
3
|
-
Version: 0.0.43
|
|
3
|
+
Version: 0.0.45
|
|
4
4
|
Summary: A Python library that simplifies data manipulation and workflow development with PySpark in Databricks environments.
|
|
5
5
|
Author: Felipe Pegoraro
|
|
6
6
|
Author-email: felipepegoraro93@gmail.com
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
import csv
|
|
2
|
-
from pyspark.sql import DataFrame
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class CsvReader:
    """Read headered CSV files through Spark, sniffing the delimiter if needed."""

    def __init__(self, spark):
        # Spark session used to build the CSV reader.
        self.spark = spark

    def detect_delimiter(self, file_path: str) -> str:
        """Sniff the delimiter from the first 2048 characters of *file_path*.

        The file is decoded as UTF-8 with undecodable bytes ignored.
        """
        handle = open(file_path, "r", encoding="utf-8", errors="ignore")
        try:
            head = handle.read(2048)
        finally:
            handle.close()
        dialect = csv.Sniffer().sniff(head)
        return dialect.delimiter

    def read_csv(self, path: str, delimiter: str | None = None) -> DataFrame:
        """Read *path* as a headered CSV, detecting the delimiter when it is ``None``."""
        separator = self.detect_delimiter(path) if delimiter is None else delimiter
        reader = self.spark.read.option("header", True)
        reader = reader.option("delimiter", separator)
        return reader.csv(path)
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from pyspark.sql import DataFrame
|
|
3
|
-
from pyspark.sql import types as T
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class ExcelReader:
    """Concatenate Excel workbooks with pandas and hand the result to Spark."""

    def __init__(self, spark):
        # Spark session used for ``createDataFrame``.
        self.spark = spark

    def concat_ps_dfs(self, paths, schema: T.StructType | None) -> DataFrame:
        """Read every workbook in *paths*, stack the rows, and build a Spark DataFrame.

        The pandas frames are concatenated with a fresh index before
        ``createDataFrame`` is called with *schema*.
        """
        frames = []
        for workbook in paths:
            frames.append(pd.read_excel(workbook))
        combined = pd.concat(frames, ignore_index=True)
        return self.spark.createDataFrame(combined, schema=schema)
|
|
15
|
-
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
import glob
|
|
2
|
-
import os
|
|
3
|
-
from typing import List
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class PathUtils:
    """Path helpers for locating workbooks and mapping DBFS URIs."""

    def list_xlsx_paths(self, directory: str) -> List[str]:
        """Return the paths matching ``*.xlsx`` directly inside *directory*."""
        pattern = os.path.join(directory, "*.xlsx")
        return glob.glob(pattern)

    def resolve_accessible_path(self, path: str) -> str:
        """Replace ``dbfs:`` with ``/dbfs`` when *path* starts with ``dbfs:``.

        Any other path is returned unchanged.
        """
        if not path.startswith("dbfs:"):
            return path
        return path.replace("dbfs:", "/dbfs")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|