servify 0.0.43__tar.gz → 0.0.45__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. {servify-0.0.43/servify.egg-info → servify-0.0.45}/PKG-INFO +1 -1
  2. {servify-0.0.43 → servify-0.0.45}/servify/__init__.py +1 -1
  3. servify-0.0.45/servify/commons/__init__.py +9 -0
  4. servify-0.0.45/servify/commons/func_read/utils/csv_reader.py +91 -0
  5. servify-0.0.45/servify/commons/func_read/utils/encoding.py +33 -0
  6. servify-0.0.45/servify/commons/func_read/utils/excel.py +208 -0
  7. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/helpers.py +4 -4
  8. servify-0.0.45/servify/commons/func_read/utils/json_reader.py +50 -0
  9. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/parquet_reader.py +3 -1
  10. servify-0.0.45/servify/commons/func_read/utils/paths.py +110 -0
  11. {servify-0.0.43 → servify-0.0.45/servify.egg-info}/PKG-INFO +1 -1
  12. {servify-0.0.43 → servify-0.0.45}/setup.py +1 -1
  13. servify-0.0.43/servify/commons/func_read/utils/csv_reader.py +0 -23
  14. servify-0.0.43/servify/commons/func_read/utils/encoding.py +0 -9
  15. servify-0.0.43/servify/commons/func_read/utils/excel.py +0 -15
  16. servify-0.0.43/servify/commons/func_read/utils/json_reader.py +0 -10
  17. servify-0.0.43/servify/commons/func_read/utils/paths.py +0 -14
  18. servify-0.0.43/servify/tests/__init__.py +0 -0
  19. {servify-0.0.43 → servify-0.0.45}/LICENCE +0 -0
  20. {servify-0.0.43 → servify-0.0.45}/README.md +0 -0
  21. {servify-0.0.43 → servify-0.0.45}/pyproject.toml +0 -0
  22. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/__init__.py +0 -0
  23. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/base.py +0 -0
  24. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/errors.py +0 -0
  25. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/read.py +0 -0
  26. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/__init__.py +0 -0
  27. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/base.py +0 -0
  28. {servify-0.0.43 → servify-0.0.45}/servify/commons/func_read/utils/schema.py +0 -0
  29. {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/__init__.py +0 -0
  30. {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/apply_schema.py +0 -0
  31. {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/base.py +0 -0
  32. {servify-0.0.43 → servify-0.0.45}/servify/commons/shared/errors.py +0 -0
  33. {servify-0.0.43/servify/commons → servify-0.0.45/servify/settings}/__init__.py +0 -0
  34. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/__init__.py +0 -0
  35. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/base.py +0 -0
  36. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/handlers.py +0 -0
  37. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/levels.py +0 -0
  38. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/logger.py +0 -0
  39. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/persist.py +0 -0
  40. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/protocols.py +0 -0
  41. {servify-0.0.43 → servify-0.0.45}/servify/settings/log/servify_configs.py +0 -0
  42. {servify-0.0.43/servify/settings → servify-0.0.45/servify/settings/path}/__init__.py +0 -0
  43. {servify-0.0.43 → servify-0.0.45}/servify/settings/path/paths.py +0 -0
  44. {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/__init__.py +0 -0
  45. {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/dbutils_helper.py +0 -0
  46. {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/environment.py +0 -0
  47. {servify-0.0.43 → servify-0.0.45}/servify/settings/spark/spark_session.py +0 -0
  48. {servify-0.0.43/servify/settings/path → servify-0.0.45/servify/tests}/__init__.py +0 -0
  49. {servify-0.0.43 → servify-0.0.45}/servify/tests/conftest.py +0 -0
  50. {servify-0.0.43 → servify-0.0.45}/servify/tests/test_commons.py +0 -0
  51. {servify-0.0.43 → servify-0.0.45}/servify.egg-info/SOURCES.txt +0 -0
  52. {servify-0.0.43 → servify-0.0.45}/servify.egg-info/dependency_links.txt +0 -0
  53. {servify-0.0.43 → servify-0.0.45}/servify.egg-info/requires.txt +0 -0
  54. {servify-0.0.43 → servify-0.0.45}/servify.egg-info/top_level.txt +0 -0
  55. {servify-0.0.43 → servify-0.0.45}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: servify
3
- Version: 0.0.43
3
+ Version: 0.0.45
4
4
  Summary: A Python library that simplifies data manipulation and workflow development with PySpark in Databricks environments.
5
5
  Author: Felipe Pegoraro
6
6
  Author-email: felipepegoraro93@gmail.com
@@ -1,6 +1,6 @@
1
1
  from pyspark.sql import DataFrame
2
2
 
3
- from .commons.func_read import servify_read
3
+ from .commons.func_read.read import servify_read
4
4
  from .settings.log.servify_configs import set_logging, LOG_ENABLED
5
5
 
6
6
  _reader: servify_read | None = None
@@ -0,0 +1,9 @@
1
+ from .func_read import servify_read
2
+ from .func_read import ConfigError, DataValidationError, IoError
3
+
4
+ __all__ = [
5
+ "servify_read",
6
+ "ConfigError",
7
+ "DataValidationError",
8
+ "IoError",
9
+ ]
@@ -0,0 +1,91 @@
1
+ import csv
2
+ import re
3
+ from pyspark.sql import DataFrame
4
+
5
+
6
+ class CsvReader:
7
+
8
+ def __init__(self, spark, log):
9
+ self.spark = spark
10
+ self.log = log
11
+
12
+ def detect_delimiter(self, file_path: str) -> str:
13
+
14
+ encoding_detectado = self.obter_enconding(file_path)
15
+
16
+ self.log.info(f"Starting delimiter detection for file: {file_path}")
17
+
18
+ try:
19
+ with open(
20
+ file_path, "r", encoding=encoding_detectado, newline=""
21
+ ) as f:
22
+ linha = f.readline()
23
+ if not linha:
24
+ self.log.warning(f"File is empty: {file_path}.")
25
+
26
+ self.log.debug(
27
+ f"first line read for delimiter detection: {linha.rstrip("\n")}"
28
+ )
29
+
30
+ except Exception as e:
31
+ self.log.error(
32
+ f"Error reading file {file_path}: {e}", exc_info=True
33
+ )
34
+ raise ValueError(f"Error reading file {file_path}: {e}") from e
35
+
36
+ delimitadores = [",", ";", "\t", "|"]
37
+
38
+ contagem = {d: len(re.findall(re.escape(d), linha)) for d in delimitadores}
39
+ self.log.debug(f"Delimiter counts: {contagem}")
40
+
41
+ if all(c == 0 for c in contagem.values()):
42
+ self.log.warning(
43
+ f"No delimiters found in the first line of file: {file_path}. Trying csv.Sniffer...."
44
+ )
45
+ try:
46
+ dialect = csv.Sniffer().sniff(linha, delimiters="," ";|")
47
+ detected = dialect.delimiter
48
+ self.log.info(f"Delimiter detected by csv.Sniffer: {detected}")
49
+ return detected
50
+ except Exception as e:
51
+ self.log.error(
52
+ f"csv.Sniffer failed to detect delimiter for file {file_path}: {e}",
53
+ exc_info=True,
54
+ )
55
+ self.log.warning(
56
+ f"Using default delimiter ',' for file: {file_path}."
57
+ )
58
+ return ","
59
+
60
+ delimitador_detectado = max(contagem.items(), key=lambda kv: kv[1])[0]
61
+ self.log.info(
62
+ f"Delimiter detected: {delimitador_detectado} for file: {file_path}"
63
+ )
64
+ return delimitador_detectado
65
+
66
+ def read_csv(self, path: str, delimiter: str | None = None, encoding: str | None = None) -> DataFrame:
67
+
68
+ if delimiter or encoding:
69
+ self.log.info(
70
+ f"Using provided delimiter: '{delimiter}' and encoding: '{encoding}' for file: {path}."
71
+ )
72
+
73
+ else:
74
+ self.log.warning(
75
+ f"No delimiter or encoding provided. Starting auto-detection for file: {path}."
76
+ )
77
+
78
+ delimiter = ","
79
+ encoding = "utf-8"
80
+
81
+ self.log.info(f"Using default delimiter: '{delimiter}' and encoding: '{encoding}' for file: {path}.")
82
+
83
+ return (
84
+ self.spark.read
85
+ .option("header", True)
86
+ .option("inferSchema", "false")
87
+ .option("samplingRatio", 0.1)
88
+ .option("delimiter", delimiter)
89
+ .option("encoding", encoding)
90
+ .csv(path)
91
+ )
@@ -0,0 +1,33 @@
1
+ import chardet
2
+
3
+
4
class EncodingUtils:
    """Detect the text encoding of local files with ``chardet``.

    Args:
        log: Optional logger-like object exposing ``debug``, ``info`` and
            ``error``. When omitted, a standard-library logger is used so the
            class also works when constructed without arguments.
    """

    def __init__(self, log=None):
        if log is None:
            # Imported locally so the module's import block stays untouched.
            import logging

            log = logging.getLogger(__name__)
        self.log = log

    def detect_encoding(self, file_path: str, *, sample_bytes: int = 4096) -> str:
        """Return the most likely encoding of *file_path*.

        Args:
            file_path: Path readable with the built-in ``open``.
            sample_bytes: Number of leading bytes handed to ``chardet``.

        Returns:
            The detected encoding name, or ``"utf-8"`` when detection yields
            nothing or reports plain ASCII.

        Raises:
            ValueError: If the file cannot be read or detection fails.
        """
        self.log.info(f"Initialized encoding detection for file: {file_path}")

        try:
            with open(file_path, "rb") as f:
                rawdata = f.read(sample_bytes)

            result = chardet.detect(rawdata) or {}
            encoding_detectado: str = result.get("encoding") or "utf-8"

            # Only the first ``sample_bytes`` are inspected, so an "ascii"
            # verdict says nothing about the rest of the file. UTF-8 decodes
            # every ASCII byte identically, so it is the safer answer.
            if encoding_detectado.lower() == "ascii":
                encoding_detectado = "utf-8"

            conf = result.get("confidence")
            self.log.debug(
                f"Encoding detected: {encoding_detectado} with confidence: {conf} and language: {result.get('language')}"
            )
        except Exception as e:
            self.log.error(
                f"Error detecting encoding for file {file_path}: {e}",
                exc_info=True,
            )
            raise ValueError(
                f"Error detecting encoding for file {file_path}: {e}"
            ) from e

        self.log.info(
            f"Encoding detection completed for file: {file_path} - Encoding: {encoding_detectado}"
        )

        return encoding_detectado
@@ -0,0 +1,208 @@
1
+ import pandas as pd
2
+ import os
3
+ import math
4
+ import re
5
+ from pyspark.sql import DataFrame
6
+ from pyspark.sql import types as T, functions as F
7
+ import numpy as np
8
+ from typing import List, Optional
9
+
10
+
11
class ExcelReader:
    """Load ``.xlsx`` workbooks through pandas and expose them as Spark DataFrames.

    Args:
        spark: Spark session used for ``createDataFrame``.
        log: Logger-like object exposing ``debug``, ``info``, ``warning`` and
            ``error``.
        commons_shared: Optional object exposing ``aplicar_schema_df(df, schema)``.
            Only needed when a schema is passed to the read methods; it may
            also be assigned to the instance after construction.
    """

    def __init__(self, spark, log, commons_shared=None):
        self.spark = spark
        self.log = log
        self.commons_shared = commons_shared

    @staticmethod
    def _is_blank(value) -> bool:
        """Return True for None, NaN, empty/whitespace text and the text 'nan'."""
        if value is None:
            return True
        if isinstance(value, float) and math.isnan(value):
            return True
        text = str(value).strip()
        return text == "" or text.lower() == "nan"

    @staticmethod
    def _stringify_cells(df: "pd.DataFrame") -> "pd.DataFrame":
        """Return a copy of *df* whose cells are ``str`` or ``None``."""
        # ``Series.where(mask, None)`` keeps NaN in numeric columns, which then
        # becomes the literal text "nan". Converting cell by cell into an
        # explicit object Series keeps missing values as real None.
        out = df.copy()
        for pos in range(out.shape[1]):
            column = out.iloc[:, pos]
            out[out.columns[pos]] = pd.Series(
                [None if pd.isna(v) else str(v) for v in column],
                index=out.index,
                dtype=object,
            )
        return out

    def saniteze_columns(
        self, header_cells: List, prefer_from_schema: "Optional[T.StructType]" = None
    ) -> List[str]:
        """Turn raw header cells into unique, identifier-safe column names.

        Whitespace runs become ``_``, every character outside ``[0-9a-zA-Z_]``
        is dropped, names starting with a digit get a ``c_`` prefix, and
        duplicates receive a ``__<k>`` suffix. Blank cells take the name at the
        same position in *prefer_from_schema* when available, otherwise
        ``c_<position>`` (1-based).

        Args:
            header_cells: Raw header values, one per column.
            prefer_from_schema: Optional iterable of fields exposing ``name``.

        Returns:
            One sanitised name per entry of *header_cells*, in order.
        """
        target_names: List[str] = (
            [f.name for f in prefer_from_schema] if prefer_from_schema else []
        )

        seen = set()
        safe_cols = []

        for i, h in enumerate(header_cells):
            fallback = f"c_{i+1}"

            if self._is_blank(h):
                base = target_names[i] if i < len(target_names) else fallback
            else:
                base = str(h).replace("\n", " ").replace("\r", " ").strip()

            base = re.sub(r"\s+", "_", base)
            base = re.sub(r"[^0-9a-zA-Z_]", "", base)

            if not base:
                # A header made only of stripped characters (e.g. "###")
                # leaves nothing behind; indexing base[0] would raise.
                base = fallback
            elif base[0].isdigit():
                base = f"c_{base}"

            name = base
            k = 1
            while name in seen:
                name = f"{base}__{k}"
                k += 1
            seen.add(name)
            safe_cols.append(name)

        return safe_cols

    def concat_ps_dfs(
        self, lista_files: List[str], schema: "Optional[T.StructType]" = None
    ) -> "DataFrame":
        """Read every workbook in *lista_files* and union the results by name.

        Args:
            lista_files: Paths of the ``.xlsx`` files to read.
            schema: Optional schema forwarded to ``read_xlsx_with_pandas``.

        Returns:
            A single DataFrame; columns missing from a file are filled by
            ``unionByName(..., allowMissingColumns=True)``.

        Raises:
            ValueError: If *lista_files* is empty.
        """
        if not lista_files:
            self.log.warning("No DataFrames provided for concatenation.")
            raise ValueError("No DataFrames provided for concatenation.")

        if len(lista_files) == 1:
            self.log.info("Only one DataFrame provided, returning it directly.")
            return self.read_xlsx_with_pandas(lista_files[0], schema=schema)

        self.log.info(f"Concatenating {len(lista_files)} Pandas on Spark DataFrames")

        df_final = self.read_xlsx_with_pandas(lista_files[0], schema=schema)
        self.log.info(f"file: {lista_files[0]} read successfully.")

        for file in lista_files[1:]:
            df = self.read_xlsx_with_pandas(file, schema=schema)
            df_final = df_final.unionByName(df, allowMissingColumns=True)
            self.log.info(f"file: {file} read and concatenated successfully.")

        self.log.info("DataFrames concatenated successfully.")
        return df_final

    def remove_header_rows(self, spark_df: "DataFrame") -> "DataFrame":
        """Drop rows whose first column repeats that column's own name.

        The comparison ignores case and underscores on both sides.

        Args:
            spark_df: DataFrame to clean.

        Returns:
            The filtered DataFrame (or *spark_df* itself when it has no columns).
        """
        if not spark_df.columns:
            return spark_df

        primeira_coluna = spark_df.columns[0]

        # NOTE(review): whitespace in the cell is collapsed to one space while
        # the column name has none, so multi-word headers such as "First Name"
        # vs "First_Name" never match - confirm whether that is intended.
        valor_normalizado = F.lower(
            F.regexp_replace(
                F.trim(F.regexp_replace(F.col(primeira_coluna), "_", "")),
                r"\s+",
                " ",
            )
        )

        nome_normalizado = primeira_coluna.lower().replace("_", "").strip()

        self.log.info(
            f"Removing rows that repeat the header in column: {primeira_coluna}"
        )

        # The filter is applied lazily and unconditionally: a separate count()
        # would launch an extra Spark job without changing the result.
        # NULL must be kept explicitly, because ``NULL != 'x'`` evaluates to
        # NULL and filter() would silently drop those rows.
        return spark_df.filter(
            valor_normalizado.isNull()
            | (valor_normalizado != F.lit(nome_normalizado))
        )

    def _apply_schema(self, sdf: "DataFrame", schema) -> "DataFrame":
        """Apply *schema* through the shared helper, failing with a clear message."""
        shared = getattr(self, "commons_shared", None)
        if shared is None:
            raise AttributeError(
                "ExcelReader.commons_shared is not set; it is required to apply a schema."
            )
        return shared.aplicar_schema_df(sdf, schema)

    def read_xlsx_with_pandas(
        self,
        xlsx_path: str,
        schema: "Optional[T.StructType]" = None,
        sheet_name: int = 0,
    ) -> "DataFrame":
        """Read one ``.xlsx`` sheet with pandas and return it as a Spark DataFrame.

        Every cell is converted to text before the Spark DataFrame is created,
        a ``source_file`` column holding the file's base name is added, the
        optional *schema* is applied, and repeated header rows are removed.

        Args:
            xlsx_path: Path of the workbook.
            schema: Optional target schema; requires ``commons_shared``.
            sheet_name: Sheet index passed to ``pandas.read_excel``.

        Raises:
            ValueError: If the sheet contains no row with at least one value.
            AttributeError: If *schema* is given but ``commons_shared`` is unset.
        """
        self.log.info(f"Reading .xlsx file: {xlsx_path} by pandas")

        try:
            df = pd.read_excel(
                xlsx_path,
                engine="openpyxl",
                header=0,
                sheet_name=sheet_name,
            )
        except Exception as e:
            self.log.warning(f"Error reading .xlsx file {xlsx_path} by pandas: {e}")
            raise

        valid_mask = df.notna().any(axis=1)
        # any(), not all(): the sheet is unusable only when *no* row has data.
        # The previous all() check rejected any sheet with one blank row.
        if not bool(valid_mask.any()):
            self.log.warning(f"All rows are empty in .xlsx file: {xlsx_path}")
            raise ValueError(f"All rows are empty in .xlsx file: {xlsx_path}")

        first_valid_pos = int(np.argmax(valid_mask.to_numpy()))

        # NOTE(review): column names are taken from the first non-empty *data*
        # row even though read_excel already consumed row 0 as header. This
        # presumably targets sheets with a title row above the real header -
        # confirm against real inputs.
        header_raw = df.iloc[first_valid_pos].tolist()
        safe_cols = self.saniteze_columns(header_raw, prefer_from_schema=schema)

        df = df.dropna(how="all").reset_index(drop=True)
        df = self._stringify_cells(df)
        df.columns = safe_cols

        if schema is None:
            self.log.info(
                "Using safe conversion mode to avoid internal Serverless Arrow errors."
            )

        string_schema = T.StructType(
            [T.StructField(c, T.StringType(), True) for c in df.columns]
        )

        try:
            sdf = self.spark.createDataFrame(df, schema=string_schema)
        except Exception as e:
            self.log.error(
                "Spark Serverless failed to create DataFrame even in safe mode."
            )
            self.log.debug(f"Internal Spark error: {e}")
            raise

        sdf = sdf.withColumn("source_file", F.lit(os.path.basename(xlsx_path)))

        if schema is not None:
            sdf = self._apply_schema(sdf, schema)

        return self.remove_header_rows(sdf)
208
+
@@ -22,10 +22,10 @@ class helper_reading_data(HelperBase):
22
22
  # composição
23
23
  self.paths = PathUtils()
24
24
  self.encoding = EncodingUtils()
25
- self.excel = ExcelReader(self.spark)
26
- self.csv = CsvReader(self.spark)
27
- self.json = JsonReader(self.spark)
28
- self.parquet = ParquetReader(self.spark)
25
+ self.excel = ExcelReader(self.spark, self.log)
26
+ self.csv = CsvReader(self.spark, self.log)
27
+ self.json = JsonReader(self.spark, self.log)
28
+ self.parquet = ParquetReader(self.spark, self.log)
29
29
  self.schema_utils = SchemaUtils(self.commons_shared)
30
30
 
31
31
  # métodos de fachada, mantendo a API que o read.py espera
@@ -0,0 +1,50 @@
1
+ from pyspark.sql import DataFrame
2
+
3
+
4
class JsonReader:
    """Read JSON files with Spark, choosing the ``multiline`` option per file.

    Args:
        spark: Object exposing the Spark reader API through ``spark.read``.
        log: Logger-like object exposing ``debug``, ``info``, ``warning`` and
            ``error``.
    """

    def __init__(self, spark, log):
        self.spark = spark
        self.log = log

    def _resolve_encoding(self, path: str) -> str:
        """Return the encoding used to open *path* for inspection."""
        # ``obter_enconding`` is not defined by this class; use it only when a
        # subclass or the composing helper has attached it to the instance.
        detector = getattr(self, "obter_enconding", None)
        if callable(detector):
            return detector(path) or "utf-8"
        return "utf-8"

    def _resolve_file(self, path: str) -> str:
        """Return a concrete local file to inspect for *path*."""
        # Same reasoning as above for ``resolve_latest_file``.
        resolver = getattr(self, "resolve_latest_file", None)
        if callable(resolver):
            return resolver(path)
        return path[len("file:"):] if path.startswith("file:") else path

    def detectar_json_multiline(self, path: str) -> bool:
        """Tell whether *path* must be read with Spark's ``multiline`` option.

        Only the first non-blank line is inspected:

        * it starts with ``[``: multiline;
        * it starts with ``{`` and is a complete JSON object on its own
          (JSON Lines or a one-line document): not multiline;
        * it starts with ``{`` but is not a complete document: multiline;
        * anything else, including an empty file: not multiline.

        Args:
            path: Path readable with the built-in ``open``.

        Returns:
            True when the document spans several lines.

        Raises:
            ValueError: If the file cannot be opened or decoded.
        """
        import json  # local import: the module's import block is left as is

        self.log.info(f"Starting JSON multiline detection for file: {path}")

        encoding_detectado = self._resolve_encoding(path)

        primeira_linha = ""
        try:
            # Stream the file: only the first non-blank line is needed, so the
            # whole content is never loaded into memory.
            with open(path, "r", encoding=encoding_detectado) as f:
                for bruta in f:
                    candidata = bruta.lstrip("\ufeff").strip()
                    if candidata:
                        primeira_linha = candidata
                        break
            self.log.info(f"File {path} read successfully.")
        except Exception as e:
            self.log.error(f"Error reading file {path}: {e}")
            raise ValueError(f"Error reading file {path}: {e}") from e

        if not primeira_linha:
            self.log.warning(f"File is empty: {path}. Assuming single line JSON.")
            return False

        self.log.debug(f"First line for JSON multiline detection: {primeira_linha}")

        if primeira_linha.startswith("["):
            multiline = True
        elif primeira_linha.startswith("{"):
            # A first line that parses on its own is one record per line;
            # reading such a file in multiline mode would treat the whole file
            # as a single document.
            try:
                json.loads(primeira_linha)
                multiline = False
            except ValueError:
                multiline = True
        else:
            multiline = False

        if multiline:
            self.log.info(f"JSON multiline detected for file: {path}")
        else:
            self.log.info(f"JSON single line detected for file: {path}")
        return multiline

    def read_json(self, path: str) -> "DataFrame":
        """Read *path* as JSON, setting ``multiline`` from the detected layout.

        The layout is detected on one concrete file resolved from *path*, while
        the read itself uses *path* unchanged (wildcards included).

        Args:
            path: Location passed to ``spark.read.json``.

        Returns:
            Whatever ``spark.read...json(path)`` returns.
        """
        arquivo_escolhido = self._resolve_file(path)
        multiline = self.detectar_json_multiline(arquivo_escolhido)

        return self.spark.read.option("multiline", str(multiline).lower()).json(path)
@@ -3,8 +3,10 @@ from pyspark.sql import DataFrame
3
3
 
4
4
  class ParquetReader:
5
5
 
6
- def __init__(self, spark):
6
+ def __init__(self, spark, log):
7
7
  self.spark = spark
8
+ self.log = log
8
9
 
9
10
  def read_parquet(self, path: str) -> DataFrame:
11
+ self.log.info(f"Reading Parquet file: {path}")
10
12
  return self.spark.read.parquet(path)
@@ -0,0 +1,110 @@
1
+ import glob
2
+ import os
3
+ from typing import List
4
+
5
+
6
class PathUtils:
    """Resolve and validate the file paths used by the readers.

    Args:
        log: Optional logger-like object exposing ``debug``, ``info`` and
            ``error``. When omitted, a standard-library logger is used so the
            class also works when constructed without arguments.
    """

    def __init__(self, log=None):
        if log is None:
            # Imported locally so the module's import block stays untouched.
            import logging

            log = logging.getLogger(__name__)
        self.log = log

    @staticmethod
    def _strip_file_scheme(path: str) -> str:
        """Remove a leading ``file:`` scheme, leaving the rest of *path* intact."""
        # Prefix-only: str.replace would also rewrite "file:" found in the
        # middle of a name (e.g. ".../logfile:2024.csv").
        return path[len("file:"):] if path.startswith("file:") else path

    @staticmethod
    def _first_accessible(candidate: str, dbutils) -> str:
        """Return *candidate* or ``file:<candidate>``, whichever ``fs.ls`` accepts."""
        try:
            dbutils.fs.ls(candidate)
            return candidate
        except Exception:
            candidate_file = f"file:{candidate}"
            try:
                dbutils.fs.ls(candidate_file)
                return candidate_file
            except Exception as exc_file:
                raise FileNotFoundError(
                    f"File '{candidate_file}' is not accessible by DBFS netheir 'file:'."
                ) from exc_file

    def list_xlsx_paths(self, directory: str) -> List[str]:
        """List the ``.xlsx`` files found at *directory*.

        Args:
            directory: Either a single ``.xlsx`` file or a directory.

        Returns:
            ``[directory]`` for a single file; otherwise the sorted paths of
            the ``.xlsx`` files directly inside the directory. Excel lock files
            (names starting with ``~$``) are skipped.

        Raises:
            ValueError: If the path is a non-xlsx file, or neither a file nor
                a directory.
            FileNotFoundError: If the directory contains no ``.xlsx`` file.
        """
        self.log.info(f"Listing .xlsx files in directory: {directory}")

        if os.path.isfile(directory):
            if directory.lower().endswith(".xlsx"):
                self.log.info(f"Single .xlsx file found: {directory}")
                return [directory]

            self.log.error(f"The specified path is a file but not .xlsx: {directory}")
            raise ValueError(f"The specified path is a file but not .xlsx: {directory}")

        if os.path.isdir(directory):
            # Sorted because os.listdir order is arbitrary; "~$name.xlsx" files
            # are lock files Excel creates while a workbook is open and are
            # not readable workbooks.
            paths = sorted(
                os.path.join(directory, nome)
                for nome in os.listdir(directory)
                if nome.lower().endswith(".xlsx") and not nome.startswith("~$")
            )
            if not paths:
                self.log.error(f"No .xlsx files found in directory: {directory}")
                raise FileNotFoundError(
                    f"No .xlsx files found in directory: {directory}"
                )

            self.log.info(f"Found {len(paths)} .xlsx files in directory: {directory}")
            return paths

        self.log.error(
            f"The specified path is neither a file nor a directory: {directory}"
        )
        raise ValueError(
            f"The specified path is neither a file nor a directory: {directory}"
        )

    def resolve_accessible_path(self, path: str, dbutils) -> str:
        """Validate *path* and return the form that ``dbutils.fs.ls`` accepts.

        * With a wildcard: at least one local match must exist and be
          reachable; the wildcard is kept in the returned path.
        * Without a wildcard: *path* is tried as given, then with a ``file:``
          prefix.

        In both cases the ``file:`` form is returned when only that form is
        reachable.

        Args:
            path: Path to validate, optionally prefixed with ``file:``.
            dbutils: Object exposing ``fs.ls(path)``, which raises when the
                path is not reachable.

        Raises:
            FileNotFoundError: If nothing matches or nothing is reachable.
        """
        if "*" in path:
            arquivos = glob.glob(self._strip_file_scheme(path))
            if not arquivos:
                raise FileNotFoundError(f"No file founded in: {path}")

            acessivel = self._first_accessible(arquivos[0], dbutils)

            # Mirror the non-wildcard branch: when the match is only reachable
            # through "file:", the returned pattern must carry that scheme too.
            if acessivel.startswith("file:") and not path.startswith("file:"):
                return f"file:{path}"
            return path

        return self._first_accessible(path, dbutils)

    def resolve_latest_file(self, path: str) -> str:
        """Return one concrete local file for *path*.

        Args:
            path: Local path, optionally prefixed with ``file:`` and optionally
                containing a ``*`` wildcard.

        Returns:
            The most recently modified match when *path* has a wildcard,
            otherwise *path* without its ``file:`` prefix.

        Raises:
            FileNotFoundError: If a wildcard matches nothing.
        """
        self.log.debug(f"Resolving path: {path}")

        path_resolvido = self._strip_file_scheme(path)

        if "*" not in path_resolvido:
            self.log.info(f"Path resolved without wildcard: {path_resolvido}")
            return path_resolvido

        arquivos = glob.glob(path_resolvido)
        self.log.debug(f"Found files with wildcard: {arquivos}")

        if not arquivos:
            self.log.error(
                f"No files found for path with wildcard: {path_resolvido}"
            )
            raise FileNotFoundError(f"No files found for path: {path_resolvido}")

        escolhido = max(arquivos, key=os.path.getmtime)
        self.log.info(f"Latest file selected: {escolhido}")
        return escolhido
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: servify
3
- Version: 0.0.43
3
+ Version: 0.0.45
4
4
  Summary: A Python library that simplifies data manipulation and workflow development with PySpark in Databricks environments.
5
5
  Author: Felipe Pegoraro
6
6
  Author-email: felipepegoraro93@gmail.com
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as arq:
5
5
 
6
6
  setup(
7
7
  name="servify",
8
- version="0.0.43",
8
+ version="0.0.45",
9
9
  license="MIT",
10
10
  author="Felipe Pegoraro",
11
11
  author_email="felipepegoraro93@gmail.com",
@@ -1,23 +0,0 @@
1
- import csv
2
- from pyspark.sql import DataFrame
3
-
4
-
5
- class CsvReader:
6
-
7
- def __init__(self, spark):
8
- self.spark = spark
9
-
10
- def detect_delimiter(self, file_path: str) -> str:
11
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
12
- sample = f.read(2048)
13
- return csv.Sniffer().sniff(sample).delimiter
14
-
15
- def read_csv(self, path: str, delimiter: str | None = None) -> DataFrame:
16
- if delimiter is None:
17
- delimiter = self.detect_delimiter(path)
18
- return (
19
- self.spark.read
20
- .option("header", True)
21
- .option("delimiter", delimiter)
22
- .csv(path)
23
- )
@@ -1,9 +0,0 @@
1
- import chardet
2
-
3
-
4
- class EncodingUtils:
5
-
6
- def detect_encoding(self, file_path: str) -> str:
7
- with open(file_path, "rb") as f:
8
- raw = f.read(50000)
9
- return chardet.detect(raw)["encoding"] or "utf-8"
@@ -1,15 +0,0 @@
1
- import pandas as pd
2
- from pyspark.sql import DataFrame
3
- from pyspark.sql import types as T
4
-
5
-
6
- class ExcelReader:
7
-
8
- def __init__(self, spark):
9
- self.spark = spark
10
-
11
- def concat_ps_dfs(self, paths, schema: T.StructType | None) -> DataFrame:
12
- pdfs = [pd.read_excel(p) for p in paths]
13
- pdf = pd.concat(pdfs, ignore_index=True)
14
- return self.spark.createDataFrame(pdf, schema=schema)
15
-
@@ -1,10 +0,0 @@
1
- from pyspark.sql import DataFrame
2
-
3
-
4
- class JsonReader:
5
-
6
- def __init__(self, spark):
7
- self.spark = spark
8
-
9
- def read_json(self, path: str) -> DataFrame:
10
- return self.spark.read.option("multiLine", True).json(path)
@@ -1,14 +0,0 @@
1
- import glob
2
- import os
3
- from typing import List
4
-
5
-
6
- class PathUtils:
7
-
8
- def list_xlsx_paths(self, directory: str) -> List[str]:
9
- return glob.glob(os.path.join(directory, "*.xlsx"))
10
-
11
- def resolve_accessible_path(self, path: str) -> str:
12
- if path.startswith("dbfs:"):
13
- return path.replace("dbfs:", "/dbfs")
14
- return path
File without changes
File without changes
File without changes
File without changes
File without changes