streamfuels 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
1
+ Metadata-Version: 2.4
2
+ Name: streamfuels
3
+ Version: 0.1.0
4
+ Summary: Data processing and analysis tools for fuel market research
5
+ Home-page: https://github.com/streamfuels/streamfuels
6
+ Author: StreamFuels
7
+ Author-email: lucascstxv@gmail.com
8
+ License: MIT
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.9
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: pandas>=1.2.0
15
+ Requires-Dist: requests>=2.25.0
16
+ Requires-Dist: beautifulsoup4>=4.9.0
17
+ Requires-Dist: unidecode>=1.1.1
18
+ Requires-Dist: numpy>=1.19.0
19
+ Requires-Dist: editdistance>=0.5.3
20
+ Requires-Dist: setuptools
21
+ Requires-Dist: tqdm==4.65.0
22
+ Dynamic: author
23
+ Dynamic: author-email
24
+ Dynamic: classifier
25
+ Dynamic: description
26
+ Dynamic: description-content-type
27
+ Dynamic: home-page
28
+ Dynamic: license
29
+ Dynamic: requires-dist
30
+ Dynamic: requires-python
31
+ Dynamic: summary
32
+
33
+ # StreamFuels
34
+
35
+ StreamFuels is a collection of tools for processing and analyzing fuel market data, focusing on petroleum derivatives, natural gas, and biofuels markets across different regions of Brazil.
36
+
37
+ ***monthly_sales_state()***:
38
+ Monthly fuel sales data by state from the ANP database
39
+
40
+ ***yearly_sales_state()***:
41
+ Yearly fuel sales data by state from the ANP database
42
+
43
+ ***yearly_sales_city()***:
44
+ Yearly fuel sales data by city from the ANP database
45
+
46
+ ***monthly_operations_state()***:
47
+ Monthly oil production, NGL production, natural gas production, reinjection, flaring and losses, self-consumption, and available natural gas. It provides a comprehensive view of petroleum and gas operations.
48
+
49
+
50
+
51
+ <!-- ## Installation
52
+
53
+ ```bash
54
+ pip install streamfuels
55
+ ``` -->
56
+
57
+
58
+ To run locally, in your target python environment and in this project folder type:
59
+ ```bash
60
+ pip install -e .
61
+ ```
62
+
63
+
64
+ After that you can import using the target python environment:
65
+
66
+ ```python
67
+ from streamfuels.datasets import DatasetLoader
68
+ loader = DatasetLoader()
69
+ result, flag = loader.yearly_sales_state()
70
+
71
+ df, metadata = loader.read_tsf(path_tsf=result)
72
+ ```
73
+
74
+ ### Yearly sales of petroleum derivatives in the states of Brazil.
75
+ ```python
76
+ result, flag = loader.yearly_sales_state()
77
+ ```
78
+ ![image](https://github.com/user-attachments/assets/ab1d0ac8-9574-4229-81e6-2e3ef32e959c)
79
+
80
+ ### Monthly sales of petroleum derivatives in the states of Brazil.
81
+ ```python
82
+ result, flag = loader.monthly_sales_state()
83
+ ```
84
+ ![image](https://github.com/user-attachments/assets/4894d0cf-eb92-421b-8b8a-d0a1522ccc0d)
85
+
86
+ ### Monthly oil and gas operations in the states of Brazil.
87
+ ```python
88
+ result, flag = loader.monthly_operations_state()
89
+ ```
90
+ ![image](https://github.com/user-attachments/assets/ab9b18b5-54ee-41f8-8948-9458b6e96343)
91
+
92
+ ### Yearly sales of petroleum derivatives in the cities of Brazil.
93
+ ```python
94
+ result, flag = loader.yearly_sales_city()
95
+ ```
96
+ ![image](https://github.com/user-attachments/assets/26ac0d96-73f9-43a8-b9bf-47106cafeba4)
97
+
98
+
99
+
@@ -0,0 +1,67 @@
1
+ # StreamFuels
2
+
3
+ StreamFuels is a collection of tools for processing and analyzing fuel market data, focusing on petroleum derivatives, natural gas, and biofuels markets across different regions of Brazil.
4
+
5
+ ***monthly_sales_state()***:
6
+ Monthly fuel sales data by state from the ANP database
7
+
8
+ ***yearly_sales_state()***:
9
+ Yearly fuel sales data by state from the ANP database
10
+
11
+ ***yearly_sales_city()***:
12
+ Yearly fuel sales data by city from the ANP database
13
+
14
+ ***monthly_operations_state()***:
15
+ Monthly oil production, NGL production, natural gas production, reinjection, flaring and losses, self-consumption, and available natural gas. It provides a comprehensive view of petroleum and gas operations.
16
+
17
+
18
+
19
+ <!-- ## Installation
20
+
21
+ ```bash
22
+ pip install streamfuels
23
+ ``` -->
24
+
25
+
26
+ To run locally, in your target python environment and in this project folder type:
27
+ ```bash
28
+ pip install -e .
29
+ ```
30
+
31
+
32
+ After that you can import using the target python environment:
33
+
34
+ ```python
35
+ from streamfuels.datasets import DatasetLoader
36
+ loader = DatasetLoader()
37
+ result, flag = loader.yearly_sales_state()
38
+
39
+ df, metadata = loader.read_tsf(path_tsf=result)
40
+ ```
41
+
42
+ ### Yearly sales of petroleum derivatives in the states of Brazil.
43
+ ```python
44
+ result, flag = loader.yearly_sales_state()
45
+ ```
46
+ ![image](https://github.com/user-attachments/assets/ab1d0ac8-9574-4229-81e6-2e3ef32e959c)
47
+
48
+ ### Monthly sales of petroleum derivatives in the states of Brazil.
49
+ ```python
50
+ result, flag = loader.monthly_sales_state()
51
+ ```
52
+ ![image](https://github.com/user-attachments/assets/4894d0cf-eb92-421b-8b8a-d0a1522ccc0d)
53
+
54
+ ### Monthly oil and gas operations in the states of Brazil.
55
+ ```python
56
+ result, flag = loader.monthly_operations_state()
57
+ ```
58
+ ![image](https://github.com/user-attachments/assets/ab9b18b5-54ee-41f8-8948-9458b6e96343)
59
+
60
+ ### Yearly sales of petroleum derivatives in the cities of Brazil.
61
+ ```python
62
+ result, flag = loader.yearly_sales_city()
63
+ ```
64
+ ![image](https://github.com/user-attachments/assets/26ac0d96-73f9-43a8-b9bf-47106cafeba4)
65
+
66
+
67
+
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,31 @@
1
from setuptools import setup, find_packages

# Read the long description with an explicit encoding and a context manager
# (the original open('README.md').read() leaked the file handle and used the
# platform-default encoding, which breaks on non-UTF-8 locales).
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name='streamfuels',
    version='0.1.0',
    packages=find_packages(),
    install_requires=[
        'pandas>=1.2.0',
        'requests>=2.25.0',
        'beautifulsoup4>=4.9.0',
        'unidecode>=1.1.1',
        'numpy>=1.19.0',
        'editdistance>=0.5.3',
        'setuptools',
        'tqdm==4.65.0'
    ],
    author='StreamFuels',
    author_email='lucascstxv@gmail.com',
    description='Data processing and analysis tools for fuel market research',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/streamfuels/streamfuels',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.9',
    license='MIT',
    license_files='LICENSE',
)
@@ -0,0 +1 @@
1
# Package version; keep in sync with the `version` field in setup.py.
__version__ = "0.1.0"
@@ -0,0 +1,4 @@
1
"""Public entry points for the streamfuels datasets package."""
from .dataset_loader import DatasetLoader
from .extract import download_anp_data

__all__ = ['DatasetLoader', 'download_anp_data']
@@ -0,0 +1,368 @@
1
+
2
+ import os
3
+ import zipfile
4
+ from unidecode import unidecode
5
+ import re
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
def znorm(x):
    """Z-normalize *x*; when the standard deviation is zero, only center it."""
    mu = np.mean(x)
    sigma = np.std(x)
    centered = x - mu
    if sigma == 0:
        return centered
    return centered / sigma
14
+
15
+ def translate_fuel_name(fuel_name):
16
+ fuel_mapping = {
17
+ 'ethanol': 'Etanol hidratado',
18
+ 'gasoline-r': 'Gasolina C',
19
+ 'gasoline-a': 'Gasolina de aviação',
20
+ 'fuel oil': 'Óleo combustível',
21
+ 'LPG': 'GLP',
22
+ 'diesel': 'Óleo diesel',
23
+ 'kerosene-i': 'Querosene iluminante',
24
+ 'kerosene-a': 'Querosene de aviação',
25
+ 'etanol': 'ethanol'
26
+ }
27
+ if fuel_name.lower() not in fuel_mapping:
28
+ print(f"Fuel name '{fuel_name}' not found in mapping.")
29
+ return fuel_mapping.get(fuel_name.lower(), "Invalid")
30
+
31
def prod_to_en(prod):
    """Translate a normalized Portuguese product key to English ("Invalid" if unknown)."""
    translations = {
        'petroleo': 'petroleum',
        'lgn': 'NGL',
        'gasnatural': 'natural gas'
    }
    return translations.get(prod.lower(), "Invalid")
39
+
40
+
41
def fuel_pt_to_en(fuel_name):
    """Map a normalized Portuguese fuel key to its English label.

    Prints a warning and returns "Invalid" when the key is unknown.
    """
    pt_to_en = {
        'etanolhidratado': 'ethanol',
        'gasolinac': 'gasoline-r',
        'gasolinadeaviacao': 'gasoline-a',
        'oleocombustivel': 'fuel oil',
        'glp': 'LPG',
        'oleodiesel': 'diesel',
        'queroseneiluminante': 'kerosene-i',
        'querosenedeaviacao': 'kerosene-a',
        'asfalto': 'asphalt',
        'etanol': 'ethanol'
    }
    key = fuel_name.lower()
    if key in pt_to_en:
        return pt_to_en[key]
    print(f"Fuel name '{fuel_name}' not found in mapping.")
    return "Invalid"
57
+
58
+
59
def get_default_download_dir():
    """Return the default directory for downloads (~/.streamfuels), creating it if needed."""
    default_dir = os.path.join(os.path.expanduser("~"), ".streamfuels")
    # exist_ok avoids the race between a separate existence check and creation
    # when two processes initialize the directory concurrently.
    os.makedirs(default_dir, exist_ok=True)
    return default_dir
65
def unzip_and_delete(zip_file_path):
    """
    Extract a ZIP archive into its own directory, then remove the archive.

    Parameters:
    - zip_file_path: Path to the ZIP file.
    """
    # Bail out early on anything that is not a readable zip archive.
    if not zipfile.is_zipfile(zip_file_path):
        print(f"The file at {zip_file_path} is not a valid zip file.")
        return

    try:
        extract_path = os.path.dirname(zip_file_path)
        with zipfile.ZipFile(zip_file_path, 'r') as archive:
            archive.extractall(extract_path)
            print(f"Extracted all contents to {extract_path}")
        os.remove(zip_file_path)
        print(f"Deleted original zip file: {zip_file_path}")
    except Exception as e:
        # Best-effort: report the failure rather than propagate it.
        print(f"An error occurred: {e}")
90
+
91
def parse_string(string):
    """Lowercase *string*, strip accents, and drop every non-alphanumeric character."""
    ascii_lower = unidecode(str(string).lower())
    return re.sub(r'[^a-zA-Z0-9]', '', ascii_lower)
93
+
94
def mes_para_numero(mes):
    """Convert a month name or number to a two-digit month string.

    Args:
        mes: Month as a Portuguese abbreviation ('JAN'..'DEZ'), a numeric
            string, a float (e.g. 3.0), or None/NaN.

    Returns:
        str: Two-digit month number; '01' for missing or unrecognized input.
    """
    # Portuguese month abbreviations plus direct numeric-string mappings.
    meses = {
        'JAN': '01', 'FEV': '02', 'MAR': '03', 'ABR': '04',
        'MAI': '05', 'JUN': '06', 'JUL': '07', 'AGO': '08',
        'SET': '09', 'OUT': '10', 'NOV': '11', 'DEZ': '12',
        '1': '01', '2': '02', '3': '03', '4': '04',
        '5': '05', '6': '06', '7': '07', '8': '08',
        '9': '09', '10': '10', '11': '11', '12': '12'
    }

    # Missing values default to January.
    if mes is None or pd.isna(mes):
        return '01'

    texto = mes if isinstance(mes, str) else str(mes)
    # Drop any decimal tail (e.g. '3.0' -> '3'); no-op when there is no dot.
    texto = texto.partition('.')[0]

    return meses.get(texto.upper(), '01')
132
+
133
def ensure_folder_exists(parts):
    """
    Build a path under the default download directory and make sure it exists.

    Parameters:
    - parts: Iterable of path components joined beneath the download directory.
      (The original docstring documented a nonexistent `folder_path` parameter.)

    Returns:
    - str: The path of the (possibly just created) folder.
    """
    base = get_default_download_dir()
    target = os.path.join(base, *parts)
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(target, exist_ok=True)
    return target
147
+
148
def estado_para_sigla(estado):
    """Map a normalized Brazilian state name to its two-letter abbreviation.

    Returns 'Undefined' when the name is not recognized.
    """
    siglas = {
        'acre': 'ac',
        'alagoas': 'al',
        'amapa': 'ap',
        'amazonas': 'am',
        'bahia': 'ba',
        'ceara': 'ce',
        'distritofederal': 'df',
        'espiritosanto': 'es',
        'goias': 'go',
        'maranhao': 'ma',
        'matogrosso': 'mt',
        'matogrossodosul': 'ms',
        'minasgerais': 'mg',
        'para': 'pa',
        'paraiba': 'pb',
        'parana': 'pr',
        'pernambuco': 'pe',
        'piaui': 'pi',
        'riodejaneiro': 'rj',
        'riograndedonorte': 'rn',
        'riograndedosul': 'rs',
        'rondonia': 'ro',
        'roraima': 'rr',
        'santacatarina': 'sc',
        'saopaulo': 'sp',
        'sergipe': 'se',
        'tocantins': 'to'
    }
    if estado in siglas:
        return siglas[estado]
    return 'Undefined'
181
+
182
def obter_max_min_datas(df, col_data, mes_ou_ano):
    """Get the maximum and minimum dates held in one DataFrame column.

    Args:
        df: DataFrame containing the data
        col_data: Column name with the date values
        mes_ou_ano: 'ano' for yearly data; any other value means monthly data

    Returns:
        tuple: (max_date, min_date) — numeric when parsing succeeds, string
        defaults when the column holds no valid dates.
    """
    # Work on a copy so the caller's DataFrame is never mutated.
    valores = df[col_data].copy()
    valores = valores[~pd.isna(valores)]

    if valores.empty:
        print(f"Warning: No valid dates found in column {col_data}")
        # Fall back to fixed defaults when nothing is parseable.
        if mes_ou_ano == 'ano':
            return ('2020', '2000')
        return ('202001', '200001')

    if mes_ou_ano == 'ano':
        try:
            # Fast path: the column already holds plain integer years.
            valores = valores.astype(int)
        except ValueError:
            # Slow path: strip decimals and non-digits, then coerce.
            valores = valores.astype(str)
            valores = valores.apply(lambda v: v.split('.')[0] if '.' in v else v)
            valores = valores.str.extract(r'(\d+)', expand=False)
            valores = pd.to_numeric(valores, errors='coerce')
            valores = valores.dropna()
        return valores.max(), valores.min()

    # Monthly data: normalize 'YYYY-MM' to numeric YYYYMM.
    try:
        valores = valores.astype(str)
        numericas = pd.to_numeric(valores.str.replace("-", ""), errors='coerce')
        numericas = numericas.dropna()
        return numericas.max(), numericas.min()
    except Exception as e:
        print(f"Error processing dates: {e}")
        print("Sample dates:", valores.head())
        # Defaults when even coercion fails.
        return 202001, 200001
245
+
246
def kg_to_m3(material, kg):
    """Convert a mass to a volume using per-material density factors.

    Returns a float volume when the material is known, otherwise a Portuguese
    error string — NOTE(review): mixed return types; callers must check for
    the string case.
    """
    # Density factors sourced from the ANP conversion-factors table:
    # https://www.gov.br/anp/pt-br/centrais-de-conteudo/publicacoes/anuario-estatistico/arquivos-anuario-estatistico-2022/outras-pecas-documentais/fatores-conversao-2022.pdf
    # NOTE(review): the units here look inconsistent — the two ethanol entries
    # (~0.8) appear to be in t/m³ while the remaining entries (~550-1025) look
    # like kg/m³; the original comment ("em TERA / M3") is garbled. Confirm
    # against the ANP table before trusting results, especially for ethanol.
    densidades = {
        'etanolanidro': 0.79100,
        'etanolhidratado': 0.80900,
        'asfalto': 1025.00,
        'biodieselb100': 880.00,
        'gasolinac': 754.25,
        'gasolinadeaviacao': 726.00,
        'glp': 552.00,
        'lgn': 580.00,
        'oleodiesel': 840.00,
        'oleocombustivel': 1013.00,
        'petroleo': 849.76,
        'querosenedeaviacao': 799.00,
        'queroseneiluminante': 799.00,
        'solventes': 741.00
    }

    if material in densidades:
        # Original comment said "Convertendo para kg/m³" (converting to kg/m³);
        # dividing e.g. 754.25 by 1e3 actually yields ~0.754, i.e. t/m³ —
        # TODO confirm the intended unit of the result.
        densidade = densidades[material] / 1e3
        m3 = kg / densidade
        return m3
    else:
        return "Material não encontrado na lista."
271
+
272
def registrar_meses_duplicados(df, produto, local, tempo):
    """Append rows with a repeated 'timestamp' to timestamps_duplicadas_<tempo>.csv."""
    registro = df.copy()
    # Count how many *other* rows share each timestamp.
    contagem = registro.groupby('timestamp')['timestamp'].transform('count')
    registro['duplicatas'] = contagem - 1
    registro = registro[registro['duplicatas'] >= 1]
    registro['derivado'] = produto
    registro['local'] = local
    registro.to_csv(f'timestamps_duplicadas_{tempo}.csv', mode='a', header=False, index=False)
280
+
281
def combinar_valores_unicos_colunas(df, colunas):
    """Return the distinct combinations of values in *colunas* as a list of tuples."""
    unicas = df[colunas].drop_duplicates().reset_index(drop=True)
    return [tuple(linha) for linha in unicas.values]
289
+
290
def first_non_nan_value(df, column_name):
    """
    Find the first non-NaN value in the specified column of a DataFrame.

    Args:
        df (DataFrame): The pandas DataFrame.
        column_name (str): The name of the column to search for non-NaN values.

    Returns:
        The first non-NaN value in the specified column, or None if no non-NaN values are found.
    """
    first_valid = df[column_name].first_valid_index()
    if first_valid is None:
        return None
    # first_valid_index() returns an index *label*; the original used .iloc,
    # which treats it as a position and breaks on non-default indexes.
    return df[column_name].loc[first_valid]
306
+
307
def last_non_nan_value(df, column_name):
    """
    Find the last non-NaN value in the specified column of a DataFrame.

    Args:
        df (DataFrame): The pandas DataFrame.
        column_name (str): The name of the column to search for non-NaN values.

    Returns:
        The last non-NaN value in the specified column, or None if no non-NaN values are found.
    """
    last_valid = df[column_name].last_valid_index()
    if last_valid is None:
        return None
    # last_valid_index() returns an index *label*; the original used .iloc,
    # which treats it as a position and breaks on non-default indexes.
    return df[column_name].loc[last_valid]
323
+
324
def find_first_sequence(arr):
    """
    Return the leading run of consecutive integers from *arr*.

    Args:
        arr (list): The input list of integers.

    Returns:
        list: The first maximal run where each element equals the previous + 1
        (empty for empty input).
    """
    if not arr:
        return []

    run = [arr[0]]
    for value in arr[1:]:
        if value != run[-1] + 1:
            # The run is broken; everything collected so far is the answer.
            break
        run.append(value)
    return run
345
+
346
def find_last_sequence(arr):
    """
    Return the trailing run of consecutive integers from *arr*.

    Args:
        arr (list): The input list of integers.

    Returns:
        list: The last maximal run where each element equals the previous + 1,
        in ascending order (empty for empty input).
    """
    if not arr:
        return []

    # Walk backwards collecting values that descend by exactly 1.
    run = [arr[-1]]
    for value in reversed(arr[:-1]):
        if value != run[-1] - 1:
            break
        run.append(value)
    run.reverse()  # restore ascending order
    return run
368
+