IPFpy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ipfpy-0.1.0/IPFpy/__init__.py +5 -0
- ipfpy-0.1.0/IPFpy/ipf.py +402 -0
- ipfpy-0.1.0/IPFpy.egg-info/PKG-INFO +131 -0
- ipfpy-0.1.0/IPFpy.egg-info/SOURCES.txt +10 -0
- ipfpy-0.1.0/IPFpy.egg-info/dependency_links.txt +1 -0
- ipfpy-0.1.0/IPFpy.egg-info/requires.txt +3 -0
- ipfpy-0.1.0/IPFpy.egg-info/top_level.txt +1 -0
- ipfpy-0.1.0/PKG-INFO +131 -0
- ipfpy-0.1.0/README.md +106 -0
- ipfpy-0.1.0/setup.cfg +4 -0
- ipfpy-0.1.0/setup.py +31 -0
- ipfpy-0.1.0/tests/test_ipf.py +38 -0
ipfpy-0.1.0/IPFpy/ipf.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Python package to perform iterative proportional fitting on data tables (RAS, matrix scaling)
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from itertools import combinations
|
|
6
|
+
import functools
|
|
7
|
+
from time import perf_counter
|
|
8
|
+
import itertools
|
|
9
|
+
import numpy as np
|
|
10
|
+
import pandas as pd
|
|
11
|
+
import duckdb
|
|
12
|
+
|
|
13
|
+
def generate_random_table(n_dim,
                          n_cat,
                          scale=1):
    """
    Generate a pandas DataFrame enumerating every cell of an n_dim-dimensional
    table with n_cat categories per dimension, plus a random "value" column.

    Args:
        n_dim (int): number of categorical dimensions (columns 0 .. n_dim-1).
        n_cat (int): number of categories per dimension (values 0 .. n_cat-1).
        scale (float, optional): upper bound of the random values. Defaults to 1.

    Returns:
        pd.DataFrame: one row per cell (n_cat ** n_dim rows) with columns
        0 .. n_dim-1 holding the category indices and "value" holding a
        random float drawn uniformly from [0, scale).
    """
    # Use ranges (not sets) so the cartesian product -- and therefore the
    # row ordering -- is deterministic rather than dependent on set
    # iteration order.
    axes = [range(n_cat) for _ in range(n_dim)]
    df = pd.DataFrame(itertools.product(*axes), columns=[*range(n_dim)])
    # Random values uniformly distributed in [0, scale)
    df["value"] = np.random.rand(len(df)) * scale
    return df
|
|
26
|
+
|
|
27
|
+
def get_unique_col_name(df, base_name):
    """
    Return a column name derived from *base_name* that does not collide
    with any existing column of *df*.

    The base name itself is returned when it is free; otherwise suffixed
    variants "base_1", "base_2", ... are tried in order until an unused
    name is found.
    """
    if base_name not in df.columns:
        return base_name
    suffix = 1
    while f"{base_name}_{suffix}" in df.columns:
        suffix += 1
    return f"{base_name}_{suffix}"
|
|
37
|
+
|
|
38
|
+
def agg_by_sql(df: pd.DataFrame,
               by,
               var,
               id):
    """
    Aggregate *var* over *df* with DuckDB, collecting *id* into a list.

    Args:
        df (pd.DataFrame): table to aggregate.
        by (list or None): columns to group by. A falsy value aggregates
            the whole table into a single row.
        var (str): numeric column to sum (DuckDB's FSUM gives an accurate
            floating-point sum).
        id (str): identifier column; the ids contributing to each
            aggregate row are gathered into a list.

    Returns:
        pd.DataFrame: the group-by columns (if any), the summed *var*
        and the list-valued *id*.
    """
    if not by:
        # Grand total: a single row covering the entire table.
        query = f"""
        SELECT
            FSUM({var}) AS {var},
            LIST({id}) AS {id}
        FROM df
        """
    else:
        # Quote integer column names so DuckDB treats them as identifiers.
        group_by_columns = ", ".join(
            f'"{col}"' if isinstance(col, int) else str(col) for col in by
        )
        query = f"""
        SELECT
            {group_by_columns},
            FSUM({var}) AS {var},
            LIST({id}) AS {id}
        FROM df
        GROUP BY {group_by_columns}
        """
    # Register the dataframe explicitly: the previous FROM 'df' (quoted)
    # asks DuckDB to open a *file* named df rather than scanning the
    # in-memory DataFrame.
    with duckdb.connect() as con:
        con.register("df", df)
        df_agg = con.execute(query).fetchdf()
    return df_agg
|
|
65
|
+
|
|
66
|
+
def aggregate_and_list(df:pd.DataFrame,
                       by, var=None,
                       margins=None,
                       id=None):
    """
    Aggregate a DataFrame across the proper subsets of the 'by' columns.

    For every subset of 'by' of size 0 up to len(by) - 1, the table is
    aggregated with :func:`agg_by_sql` and the resulting frames are
    stacked into a single DataFrame.

    Args:
        df (pd.DataFrame): source data to be aggregated.
        by (str or list): column name(s) from which the subsets are built.
        var (str, optional): column to be aggregated. Defaults to None.
        margins (list of lists, optional): when given, only the subsets
            listed here are processed. Defaults to None.
        id (str, optional): identifier column collected into lists by the
            aggregation. Defaults to None.

    Returns:
        pd.DataFrame: the concatenation of every subset aggregation.
    """
    if by is not None and not isinstance(by, list):
        by = [by]

    if by is None:
        subsets = [[]]
    else:
        # Every proper subset of 'by' (sizes 0 .. len(by) - 1).
        subsets = [list(combo)
                   for size in range(len(by))
                   for combo in combinations(by, size)]

    if margins is not None:
        subsets = [sub for sub in subsets if sub in margins]

    pieces = [agg_by_sql(df, by=sub, var=var, id=id) for sub in subsets]
    if not pieces:
        return pd.DataFrame()
    return pd.concat(pieces, ignore_index=True)
|
|
109
|
+
|
|
110
|
+
def aggregate_table(df_in,
                    by=None,
                    var=None,
                    margins=None):
    """
    Decompose an input table into cells, margins, and the mapping
    between them.

    Returns three DataFrames:
      * cells   : one row per cell of the table, carrying a fresh unique
                  identifier column ("unit_id" or a non-colliding variant);
      * margins : the aggregated margins of the table, each with a fresh
                  constraint identifier ("cons_id" or a variant);
      * mapping : one row per (cell id, constraint id) pair linking each
                  cell to every margin it adds up to.
    """
    # Collapse possible duplicates so the table has a single entry per cell.
    cells = df_in.groupby(by)[var].sum().reset_index()

    # Attach a unique identifier to every cell, avoiding name collisions
    # with existing columns.
    unit_col = get_unique_col_name(cells, "unit_id")
    cells[unit_col] = range(len(cells))

    # Margins of the table; each margin row carries the list of cell ids
    # summing into it.
    margins_df = aggregate_and_list(cells, by, var, margins, unit_col)
    cons_col = get_unique_col_name(margins_df, "cons_id")
    margins_df[cons_col] = range(len(margins_df))

    # Explode the id lists: one row per (cell id, constraint id) pair.
    mapping = margins_df.explode(unit_col).reset_index(drop=True)

    return (cells,
            margins_df.drop([unit_col], axis=1),
            mapping[[unit_col, cons_col]])
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def get_discrepancy(con):
    """
    Compute the discrepancies between the aggregated margins and their targets.

    Reads from the database connection *con*:
        wrk_weights            : current weight per unit_id
        wrk_input_constraints  : (unit_id, cons_id) mapping
        wrk_input_targets      : (cons_id, cons_type, target)

    Writes to *con*:
        wrk_constraints        : current aggregated weight per constraint
        wrk_discrepancies      : per-constraint additive diff and
                                 multiplicative adjustement factor

    Returns:
        The largest absolute difference between a target and its current
        approximation; already-satisfied 'le'/'ge' constraints count as 0.
    """
    # Sum the current weights of the units belonging to each constraint.
    con.execute("""
    CREATE OR REPLACE TABLE wrk_constraints AS
    SELECT a.cons_id, fsum(b.weight) as aggregated_weight_per_constraint
    FROM wrk_input_constraints AS a
    LEFT JOIN wrk_weights AS b
    ON a.unit_id=b.unit_id
    GROUP by a.cons_id
    ;
    """)
    # Pair each target with its current approximation.
    con.execute("""
    CREATE OR REPLACE TABLE wrk_discrepancies AS
    SELECT a.cons_id, a.cons_type, a.target, b.aggregated_weight_per_constraint as target_approximation
    FROM wrk_input_targets AS a
    LEFT JOIN wrk_constraints AS b
    ON a.cons_id = b.cons_id
    ;
    """)
    # Derive the additive (diff) and multiplicative (adjustement) gaps,
    # then neutralise inequality constraints that are already satisfied.
    # NOTE(review): adjustement divides by target_approximation -- a zero
    # aggregated weight would yield inf/NaN here; confirm inputs are > 0.
    con.execute("""
    CREATE OR REPLACE TABLE wrk_discrepancies AS
    SELECT *,
    -- Step 1: Compute diff and adjustement
    target - target_approximation AS diff,
    target / target_approximation AS adjustement
    FROM wrk_discrepancies;

    -- Step 2: Apply constraints on adjustement and diff
    UPDATE wrk_discrepancies
    SET adjustement = CASE
            WHEN cons_type = 'le' AND adjustement > 1 THEN 1
            WHEN cons_type = 'ge' AND adjustement < 1 THEN 1
            ELSE adjustement
        END,
        diff = CASE
            WHEN cons_type = 'le' AND adjustement > 1 THEN 0
            WHEN cons_type = 'ge' AND adjustement < 1 THEN 0
            ELSE diff
        END;
    ;
    """)
    # Worst remaining absolute gap drives the stopping criterion in ipf().
    max_discrepancy = con.execute("SELECT max(abs(diff)) FROM wrk_discrepancies ;").fetchone()[0]
    return max_discrepancy
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def timer(func):
    """
    Decorator that prints how long each call to *func* takes.

    The wrapped function's return value is passed through unchanged.
    """
    @functools.wraps(func)
    def timed(*args, **kwargs):
        start = perf_counter()
        result = func(*args, **kwargs)
        elapsed = perf_counter() - start
        print(f"Elapsed time: {elapsed:0.4f} seconds")
        return result
    return timed
|
|
224
|
+
|
|
225
|
+
def _register_source(con, name, source):
    """Expose *source* to DuckDB under table/view *name*.

    A DataFrame is registered directly; a str is treated as a path to a
    CSV/Parquet/JSON file and wrapped in a view (DuckDB infers the format
    from the extension).
    """
    if isinstance(source, pd.DataFrame):
        con.register(name, source)
    elif isinstance(source, str):
        con.execute(f"CREATE OR REPLACE VIEW {name} AS SELECT * FROM '{source}'")


@timer
def ipf(input=None,
        constraints=None,
        targets=None,
        unit_id="unit_id",
        var="weight",
        cons_id="cons_id",
        lb=None,
        ub=None,
        cons_type=None,
        db_file=":memory:",
        tol=1,
        max_iter=100,
        out_parquet=None,
        out_csv=None,
        silent=False):
    """
    Adjust a table of weights by iterative proportional fitting (RAS).

    Args:
        input (pd.DataFrame or str): units table; each row is a cell whose
            value is adjusted. Columns:
                unit_id : identifier for the decision variables
                weight  : decision variables. >= 0
                lb      : optional lower bound (weight >= lb)
                ub      : optional upper bound (weight <= ub)
            A str is interpreted as a path to a CSV/Parquet/JSON file.
        constraints (pd.DataFrame or str): maps, for each constraint
            identifier, which unit_id to aggregate (columns unit_id, cons_id).
        targets (pd.DataFrame or str): target values the margins should
            reach (columns cons_id, target, and optionally a type column).
        unit_id (str): name of the unit identifier column in *input*.
        var (str): name of the value column in *input*.
        cons_id (str): name of the constraint identifier column.
        lb, ub: when truthy, the lb/ub columns of *input* are carried
            along and re-enforced after every iteration.
        cons_type (str, optional): name of the column of *targets* holding
            the constraint type ('eq', 'le' or 'ge'). When omitted every
            constraint is treated as an equality.
        db_file (str): DuckDB database holding the temporary working
            tables; defaults to in-memory.
        tol (float): stop when the largest absolute discrepancy drops
            below this threshold.
        max_iter (int): maximum number of iterations.
        out_parquet (str, optional): path of a Parquet output file.
        out_csv (str, optional): path of a CSV output file.
        silent (bool): suppress progress output when True.

    Returns:
        pd.DataFrame or None: the adjusted weights (unit_id, weight and
        any bound columns), or None when an output file was requested.
    """
    if not silent:
        print()
        print("-----------")
        print("Calibration")
        print("-----------")
        print()

    with duckdb.connect(db_file) as con:
        # Register the three sources as virtual tables.
        # BUG FIX: the targets branch previously tested isinstance(constraints,
        # str), so file-based targets were silently ignored.
        _register_source(con, 'input_table', input)
        _register_source(con, 'constraints_table', constraints)
        _register_source(con, 'targets_table', targets)

        # Problem size, using the caller-supplied column names
        # (previously hard-coded to "unit_id"/"cons_id").
        n_units = con.execute(f"SELECT COUNT(DISTINCT {unit_id}) FROM input_table;").fetchone()[0]
        n_constraints = con.execute(f"SELECT COUNT(DISTINCT {cons_id}) FROM constraints_table;").fetchone()[0]

        if not silent:
            print(f"Number of equations: {n_constraints}")
            print(f"Number of units : {n_units}")
            print()

        # Working table of weights, renamed to canonical column names.
        sql_select = f"SELECT {unit_id} as unit_id, {var} as weight"
        if lb:
            sql_select += ", lb"
        if ub:
            sql_select += ", ub"
        con.execute(f"""
            CREATE TABLE wrk_weights AS
            {sql_select}
            FROM input_table
        """)

        # Constraint membership, renamed to canonical column names.
        con.execute(f"""
            CREATE TABLE wrk_input_constraints AS
            SELECT {unit_id} as unit_id, {cons_id} as cons_id
            FROM constraints_table
        """)

        # Target values; default every constraint to an equality. The type
        # column is aliased to cons_type so downstream SQL always finds it.
        type_expr = f"{cons_type} as cons_type" if cons_type else "'eq' as cons_type"
        con.execute(f"""
            CREATE TABLE wrk_input_targets AS
            SELECT {cons_id} as cons_id, {type_expr}, target
            FROM targets_table
        """)

        # Initial gap between the margins and their targets.
        max_discrepancy = get_discrepancy(con)
        if not silent:
            print(f"Initial max discrepancy : {max_discrepancy} ")

        n_iter = 0
        while (max_discrepancy >= tol) and (n_iter <= max_iter):
            # Attach each constraint's multiplicative adjustment to its units.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_constraints as
                SELECT a.*, b.adjustement
                FROM wrk_input_constraints as a
                LEFT JOIN wrk_discrepancies as b
                ON a.cons_id = b.cons_id
                ;
            """)
            # Geometric mean of the adjustments touching each unit.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_unit_adjustement AS
                SELECT unit_id, exp(mean(log(adjustement))) as adjust
                FROM wrk_constraints
                GROUP BY unit_id
            """)
            # Apply the adjustment to the weights.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_weights AS
                SELECT a.* EXCLUDE weight, a.weight*b.adjust as weight
                FROM wrk_weights as a
                LEFT JOIN wrk_unit_adjustement as b
                ON a.unit_id = b.unit_id
            """)
            # Clamp back into bounds. BUG FIX: EXCLUDE must follow the *
            # selector ("SELECT * EXCLUDE (weight), ..."); the previous
            # "FROM wrk_weights EXCLUDE weight_;" was invalid SQL.
            if lb:
                con.execute("""
                    CREATE OR REPLACE TABLE wrk_weights AS
                    SELECT * EXCLUDE (weight), GREATEST(weight, lb) AS weight
                    FROM wrk_weights;
                """)
            if ub:
                con.execute("""
                    CREATE OR REPLACE TABLE wrk_weights AS
                    SELECT * EXCLUDE (weight), LEAST(weight, ub) AS weight
                    FROM wrk_weights;
                """)

            max_discrepancy = get_discrepancy(con)

            if not silent:
                print(f"iteration {n_iter} : {max_discrepancy}")
            n_iter += 1

        if out_parquet:
            con.execute(f"COPY wrk_weights TO '{out_parquet}' (FORMAT PARQUET);")
        if out_csv:
            con.execute(f"COPY wrk_weights TO '{out_csv}' (HEADER, DELIMITER ',');")

        # Only return an in-memory result when no file output was requested.
        if not (out_parquet or out_csv):
            return con.execute("SELECT * FROM wrk_weights").fetchdf()
|
|
402
|
+
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: IPFpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Performs iterative proportional fitting on tabular data
|
|
5
|
+
Home-page: https://github.com/veozen/IPF
|
|
6
|
+
Author: Christian Gagné
|
|
7
|
+
Author-email: christian.gagne@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: numpy>=1.23.5
|
|
14
|
+
Requires-Dist: pandas>=2.1.2
|
|
15
|
+
Requires-Dist: duckdb>=1.4.0
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: author-email
|
|
18
|
+
Dynamic: classifier
|
|
19
|
+
Dynamic: description
|
|
20
|
+
Dynamic: description-content-type
|
|
21
|
+
Dynamic: home-page
|
|
22
|
+
Dynamic: requires-dist
|
|
23
|
+
Dynamic: requires-python
|
|
24
|
+
Dynamic: summary
|
|
25
|
+
|
|
26
|
+
# IPFpy
|
|
27
|
+
Iterative proportionial fitting that can work with larger than memory tables.
|
|
28
|
+
|
|
29
|
+
inputs tables can be either pandas dataframes, .csv file or .parquet file
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
input: table
|
|
33
|
+
Thif table lists all the cells or units in a table whose value will be adjusted by Iterative proportional fitting along with boundaries whose adjusted value is meant to stay within.
|
|
34
|
+
unit_id : identifier for the decision variables
|
|
35
|
+
weight : decision variables. >=0
|
|
36
|
+
lb : weight >= lb
|
|
37
|
+
ub : weight <= up
|
|
38
|
+
|
|
39
|
+
constraints : table
|
|
40
|
+
This table maps for each constaint identifier, which unit_id to aggregate
|
|
41
|
+
unit_id : identifier for the decision variables
|
|
42
|
+
cons_id : identifier for each marging
|
|
43
|
+
|
|
44
|
+
targets : table
|
|
45
|
+
This table lists all the target values that the margins should add up to once adjusted
|
|
46
|
+
cons_id : identifier for each marging
|
|
47
|
+
cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
|
|
48
|
+
target : value for the constaint
|
|
49
|
+
|
|
50
|
+
unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
|
|
51
|
+
var : name of the column that contains the value to be adjusted (default "weight")
|
|
52
|
+
cons_id : name of the column that identifies each constraints (default "cons_id")
|
|
53
|
+
|
|
54
|
+
db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
|
|
55
|
+
|
|
56
|
+
out_parquet (optional): name path of the parquet output file
|
|
57
|
+
out_csv (optional) : name path of the csv output file
|
|
58
|
+
|
|
59
|
+
silent (optinal default false): Whether or not to print progress to screen
|
|
60
|
+
|
|
61
|
+
output : table
|
|
62
|
+
Output table lists all the initials cells/units along with their adjusted values.
|
|
63
|
+
untiId : identifier for the decision variables
|
|
64
|
+
weight : adjusted weight. Will fit in the interval lb <= weight <= ub
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Example
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from IPFpy import *
|
|
72
|
+
import numpy as np
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# test IPF
|
|
76
|
+
#step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
|
|
77
|
+
raw_table = generate_random_table(4,8,scale=2)
|
|
78
|
+
input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
|
|
79
|
+
margins = margins.rename(columns={"value":"target"}) #rename margin column
|
|
80
|
+
|
|
81
|
+
# step2 - modify the margins by adding noise to the inner cells
|
|
82
|
+
new_table = input_table.copy().drop("unit_id",axis=1)
|
|
83
|
+
new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
|
|
84
|
+
modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
|
|
85
|
+
modified_margins = modified_margins.rename(columns={"value":"target"})
|
|
86
|
+
|
|
87
|
+
# write table as csv
|
|
88
|
+
input_table.to_csv('input_table.csv', index=False)
|
|
89
|
+
constraints.to_csv('constraints.csv', index=False)
|
|
90
|
+
modified_margins.to_csv('modified_margins.csv', index=False)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
df.to_parquet('my_data.parquet', engine='pyarrow')
|
|
94
|
+
|
|
95
|
+
# adjust the table in step1 to the margin obtained in step2
|
|
96
|
+
adjusted_table = ipf( input=input_table,
|
|
97
|
+
constraints=constraints,
|
|
98
|
+
targets=modified_margins,
|
|
99
|
+
unit_id="unit_id",
|
|
100
|
+
var="value",
|
|
101
|
+
cons_id="cons_id",
|
|
102
|
+
db_file=None,
|
|
103
|
+
tol=0.1,
|
|
104
|
+
max_iter=1000)
|
|
105
|
+
|
|
106
|
+
# output to a file
|
|
107
|
+
ipf(input =input_table,
|
|
108
|
+
constraints =constraints,
|
|
109
|
+
targets =modified_margins,
|
|
110
|
+
unit_id ="unit_id",
|
|
111
|
+
var ="value",
|
|
112
|
+
cons_id ="cons_id",
|
|
113
|
+
tol =0.1,
|
|
114
|
+
max_iter =1000,
|
|
115
|
+
out_csv ="adjusted_table.csv",
|
|
116
|
+
silent=True)
|
|
117
|
+
|
|
118
|
+
# input directly from files
|
|
119
|
+
# paths to the input files have to be adjusted to correspond to the location of the input files
|
|
120
|
+
ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
|
|
121
|
+
constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
|
|
122
|
+
targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
|
|
123
|
+
unit_id ="unit_id",
|
|
124
|
+
var ="value",
|
|
125
|
+
cons_id ="cons_id",
|
|
126
|
+
tol =0.1,
|
|
127
|
+
max_iter =1000,
|
|
128
|
+
out_csv ="adjusted_table.csv",
|
|
129
|
+
silent=True)
|
|
130
|
+
|
|
131
|
+
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
IPFpy
|
ipfpy-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: IPFpy
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Performs iterative proportional fitting on tabular data
|
|
5
|
+
Home-page: https://github.com/veozen/IPF
|
|
6
|
+
Author: Christian Gagné
|
|
7
|
+
Author-email: christian.gagne@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: numpy>=1.23.5
|
|
14
|
+
Requires-Dist: pandas>=2.1.2
|
|
15
|
+
Requires-Dist: duckdb>=1.4.0
|
|
16
|
+
Dynamic: author
|
|
17
|
+
Dynamic: author-email
|
|
18
|
+
Dynamic: classifier
|
|
19
|
+
Dynamic: description
|
|
20
|
+
Dynamic: description-content-type
|
|
21
|
+
Dynamic: home-page
|
|
22
|
+
Dynamic: requires-dist
|
|
23
|
+
Dynamic: requires-python
|
|
24
|
+
Dynamic: summary
|
|
25
|
+
|
|
26
|
+
# IPFpy
|
|
27
|
+
Iterative proportionial fitting that can work with larger than memory tables.
|
|
28
|
+
|
|
29
|
+
inputs tables can be either pandas dataframes, .csv file or .parquet file
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
input: table
|
|
33
|
+
Thif table lists all the cells or units in a table whose value will be adjusted by Iterative proportional fitting along with boundaries whose adjusted value is meant to stay within.
|
|
34
|
+
unit_id : identifier for the decision variables
|
|
35
|
+
weight : decision variables. >=0
|
|
36
|
+
lb : weight >= lb
|
|
37
|
+
ub : weight <= up
|
|
38
|
+
|
|
39
|
+
constraints : table
|
|
40
|
+
This table maps for each constaint identifier, which unit_id to aggregate
|
|
41
|
+
unit_id : identifier for the decision variables
|
|
42
|
+
cons_id : identifier for each marging
|
|
43
|
+
|
|
44
|
+
targets : table
|
|
45
|
+
This table lists all the target values that the margins should add up to once adjusted
|
|
46
|
+
cons_id : identifier for each marging
|
|
47
|
+
cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
|
|
48
|
+
target : value for the constaint
|
|
49
|
+
|
|
50
|
+
unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
|
|
51
|
+
var : name of the column that contains the value to be adjusted (default "weight")
|
|
52
|
+
cons_id : name of the column that identifies each constraints (default "cons_id")
|
|
53
|
+
|
|
54
|
+
db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
|
|
55
|
+
|
|
56
|
+
out_parquet (optional): name path of the parquet output file
|
|
57
|
+
out_csv (optional) : name path of the csv output file
|
|
58
|
+
|
|
59
|
+
silent (optinal default false): Whether or not to print progress to screen
|
|
60
|
+
|
|
61
|
+
output : table
|
|
62
|
+
Output table lists all the initials cells/units along with their adjusted values.
|
|
63
|
+
untiId : identifier for the decision variables
|
|
64
|
+
weight : adjusted weight. Will fit in the interval lb <= weight <= ub
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Example
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from IPFpy import *
|
|
72
|
+
import numpy as np
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# test IPF
|
|
76
|
+
#step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
|
|
77
|
+
raw_table = generate_random_table(4,8,scale=2)
|
|
78
|
+
input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
|
|
79
|
+
margins = margins.rename(columns={"value":"target"}) #rename margin column
|
|
80
|
+
|
|
81
|
+
# step2 - modify the margins by adding noise to the inner cells
|
|
82
|
+
new_table = input_table.copy().drop("unit_id",axis=1)
|
|
83
|
+
new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
|
|
84
|
+
modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
|
|
85
|
+
modified_margins = modified_margins.rename(columns={"value":"target"})
|
|
86
|
+
|
|
87
|
+
# write table as csv
|
|
88
|
+
input_table.to_csv('input_table.csv', index=False)
|
|
89
|
+
constraints.to_csv('constraints.csv', index=False)
|
|
90
|
+
modified_margins.to_csv('modified_margins.csv', index=False)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
df.to_parquet('my_data.parquet', engine='pyarrow')
|
|
94
|
+
|
|
95
|
+
# adjust the table in step1 to the margin obtained in step2
|
|
96
|
+
adjusted_table = ipf( input=input_table,
|
|
97
|
+
constraints=constraints,
|
|
98
|
+
targets=modified_margins,
|
|
99
|
+
unit_id="unit_id",
|
|
100
|
+
var="value",
|
|
101
|
+
cons_id="cons_id",
|
|
102
|
+
db_file=None,
|
|
103
|
+
tol=0.1,
|
|
104
|
+
max_iter=1000)
|
|
105
|
+
|
|
106
|
+
# output to a file
|
|
107
|
+
ipf(input =input_table,
|
|
108
|
+
constraints =constraints,
|
|
109
|
+
targets =modified_margins,
|
|
110
|
+
unit_id ="unit_id",
|
|
111
|
+
var ="value",
|
|
112
|
+
cons_id ="cons_id",
|
|
113
|
+
tol =0.1,
|
|
114
|
+
max_iter =1000,
|
|
115
|
+
out_csv ="adjusted_table.csv",
|
|
116
|
+
silent=True)
|
|
117
|
+
|
|
118
|
+
# input directly from files
|
|
119
|
+
# paths to the input files have to be adjusted to correspond to the location of the input files
|
|
120
|
+
ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
|
|
121
|
+
constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
|
|
122
|
+
targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
|
|
123
|
+
unit_id ="unit_id",
|
|
124
|
+
var ="value",
|
|
125
|
+
cons_id ="cons_id",
|
|
126
|
+
tol =0.1,
|
|
127
|
+
max_iter =1000,
|
|
128
|
+
out_csv ="adjusted_table.csv",
|
|
129
|
+
silent=True)
|
|
130
|
+
|
|
131
|
+
```
|
ipfpy-0.1.0/README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# IPFpy
|
|
2
|
+
Iterative proportionial fitting that can work with larger than memory tables.
|
|
3
|
+
|
|
4
|
+
inputs tables can be either pandas dataframes, .csv file or .parquet file
|
|
5
|
+
|
|
6
|
+
```
|
|
7
|
+
input: table
|
|
8
|
+
This table lists all the cells or units in a table whose values will be adjusted by iterative proportional fitting, along with boundaries the adjusted values are meant to stay within.
|
|
9
|
+
unit_id : identifier for the decision variables
|
|
10
|
+
weight : decision variables. >=0
|
|
11
|
+
lb : weight >= lb
|
|
12
|
+
ub : weight <= up
|
|
13
|
+
|
|
14
|
+
constraints : table
|
|
15
|
+
This table maps for each constaint identifier, which unit_id to aggregate
|
|
16
|
+
unit_id : identifier for the decision variables
|
|
17
|
+
cons_id : identifier for each marging
|
|
18
|
+
|
|
19
|
+
targets : table
|
|
20
|
+
This table lists all the target values that the margins should add up to once adjusted
|
|
21
|
+
cons_id : identifier for each marging
|
|
22
|
+
cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
|
|
23
|
+
target : value for the constaint
|
|
24
|
+
|
|
25
|
+
unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
|
|
26
|
+
var : name of the column that contains the value to be adjusted (default "weight")
|
|
27
|
+
cons_id : name of the column that identifies each constraints (default "cons_id")
|
|
28
|
+
|
|
29
|
+
db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
|
|
30
|
+
|
|
31
|
+
out_parquet (optional): name path of the parquet output file
|
|
32
|
+
out_csv (optional) : name path of the csv output file
|
|
33
|
+
|
|
34
|
+
silent (optinal default false): Whether or not to print progress to screen
|
|
35
|
+
|
|
36
|
+
output : table
|
|
37
|
+
Output table lists all the initials cells/units along with their adjusted values.
|
|
38
|
+
untiId : identifier for the decision variables
|
|
39
|
+
weight : adjusted weight. Will fit in the interval lb <= weight <= ub
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Example
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from IPFpy import *
|
|
47
|
+
import numpy as np
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# test IPF
|
|
51
|
+
#step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
|
|
52
|
+
raw_table = generate_random_table(4,8,scale=2)
|
|
53
|
+
input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
|
|
54
|
+
margins = margins.rename(columns={"value":"target"}) #rename margin column
|
|
55
|
+
|
|
56
|
+
# step2 - modify the margins by adding noise to the inner cells
|
|
57
|
+
new_table = input_table.copy().drop("unit_id",axis=1)
|
|
58
|
+
new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
|
|
59
|
+
modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
|
|
60
|
+
modified_margins = modified_margins.rename(columns={"value":"target"})
|
|
61
|
+
|
|
62
|
+
# write table as csv
|
|
63
|
+
input_table.to_csv('input_table.csv', index=False)
|
|
64
|
+
constraints.to_csv('constraints.csv', index=False)
|
|
65
|
+
modified_margins.to_csv('modified_margins.csv', index=False)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
input_table.to_parquet('my_data.parquet', engine='pyarrow')
|
|
69
|
+
|
|
70
|
+
# adjust the table in step1 to the margin obtained in step2
|
|
71
|
+
adjusted_table = ipf( input=input_table,
|
|
72
|
+
constraints=constraints,
|
|
73
|
+
targets=modified_margins,
|
|
74
|
+
unit_id="unit_id",
|
|
75
|
+
var="value",
|
|
76
|
+
cons_id="cons_id",
|
|
77
|
+
db_file=":memory:",
|
|
78
|
+
tol=0.1,
|
|
79
|
+
max_iter=1000)
|
|
80
|
+
|
|
81
|
+
# output to a file
|
|
82
|
+
ipf(input =input_table,
|
|
83
|
+
constraints =constraints,
|
|
84
|
+
targets =modified_margins,
|
|
85
|
+
unit_id ="unit_id",
|
|
86
|
+
var ="value",
|
|
87
|
+
cons_id ="cons_id",
|
|
88
|
+
tol =0.1,
|
|
89
|
+
maxIter =1000,
|
|
90
|
+
out_csv ="adjusted_table.csv",
|
|
91
|
+
silent=True)
|
|
92
|
+
|
|
93
|
+
# input directly from files
|
|
94
|
+
# paths to the input files have to be adjusted to correspond to the location of the input files
|
|
95
|
+
ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
|
|
96
|
+
constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
|
|
97
|
+
targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
|
|
98
|
+
unit_id ="unit_id",
|
|
99
|
+
var ="value",
|
|
100
|
+
cons_id ="cons_id",
|
|
101
|
+
tol =0.1,
|
|
102
|
+
maxIter =1000,
|
|
103
|
+
out_csv ="adjusted_table.csv",
|
|
104
|
+
silent=True)
|
|
105
|
+
|
|
106
|
+
```
|
ipfpy-0.1.0/setup.cfg
ADDED
ipfpy-0.1.0/setup.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Setup configuration for the IPFpy package.
|
|
3
|
+
|
|
4
|
+
This script handles the packaging, dependency management, and metadata
|
|
5
|
+
required to distribute IPFpy via PyPI.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from setuptools import setup, find_packages
|
|
9
|
+
|
|
10
|
+
setup(
|
|
11
|
+
name='IPFpy',
|
|
12
|
+
version='0.1.0',
|
|
13
|
+
packages=find_packages(),
|
|
14
|
+
install_requires=[
|
|
15
|
+
'numpy>=1.23.5',
|
|
16
|
+
'pandas>=2.1.2',
|
|
17
|
+
'duckdb>=1.4.0',
|
|
18
|
+
],
|
|
19
|
+
author='Christian Gagné',
|
|
20
|
+
author_email='christian.gagne@gmail.com',
|
|
21
|
+
description='Performs iterative proportional fitting on tabular data',
|
|
22
|
+
long_description=open('README.md').read(),
|
|
23
|
+
long_description_content_type='text/markdown',
|
|
24
|
+
url='https://github.com/veozen/IPF',
|
|
25
|
+
classifiers=[
|
|
26
|
+
'Programming Language :: Python :: 3',
|
|
27
|
+
'License :: OSI Approved :: MIT License',
|
|
28
|
+
'Operating System :: OS Independent',
|
|
29
|
+
],
|
|
30
|
+
python_requires='>=3.10',
|
|
31
|
+
)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for IPFpy package
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import numpy as np
|
|
8
|
+
from IPFpy import ipf, generate_random_table, aggregate_table
|
|
9
|
+
|
|
10
|
+
def test_ipf_basic_execution():
    """
    Smoke test: ipf returns a non-empty DataFrame for a random
    4-dimensional table with 8 categories per dimension.
    """
    # Step 1 - build a random table and derive its margins plus the
    # file mapping inner-table cells to those margins.
    seed_table = generate_random_table(4, 8, scale=2)
    table, margins, constraints = aggregate_table(seed_table, by=[0, 1, 2, 3], var="value")
    margins = margins.rename(columns={"value": "target"})  # margin column -> target

    # Step 2 - perturb the inner cells with uniform noise and recompute
    # the margins, giving IPF something to converge toward.
    noisy = table.copy().drop("unit_id", axis=1)
    noisy["value"] = table["value"] * np.random.uniform(0, 2, table.shape[0])
    _, perturbed_margins, constraints = aggregate_table(noisy, by=[0, 1, 2, 3], var="value")
    perturbed_margins = perturbed_margins.rename(columns={"value": "target"})

    # Execution - adjust the step-1 table to the step-2 margins.
    # NOTE(review): this passes max_iter while the README example uses
    # maxIter — confirm against the actual ipf() signature.
    adjusted_table = ipf(input=table,
                         constraints=constraints,
                         targets=perturbed_margins,
                         unit_id="unit_id",
                         var="value",
                         cons_id="cons_id",
                         tol=0.1,
                         max_iter=1000)

    # Assertions - a non-empty DataFrame must come back.
    assert isinstance(adjusted_table, pd.DataFrame), f"Expected Pandas Dataframe, got {type(adjusted_table)}"
    assert len(adjusted_table) > 0, "The returned table is empty"
|
|
38
|
+
|