IPFpy 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ from .ipf import ipf, generate_random_table, aggregate_table
2
+
3
+ __author__ = "Christian Gagné"
4
+ __version__ = "0.1.0"
5
+ __all__ = ["ipf", "generate_random_table", "aggregate_table"]
@@ -0,0 +1,402 @@
1
+ """
2
+ Python package to perform iterative proportional fitting on data tables (RAS, matrix scaling)
3
+ """
4
+
5
+ from itertools import combinations
6
+ import functools
7
+ from time import perf_counter
8
+ import itertools
9
+ import numpy as np
10
+ import pandas as pd
11
+ import duckdb
12
+
13
def generate_random_table(n_dim,
                          n_cat,
                          scale=1):
    """
    Generate a pandas DataFrame enumerating the full cross of categories
    with one random value per cell.

    Args:
        n_dim (int): number of dimension columns, named 0..n_dim-1.
        n_cat (int): number of categories per dimension, values 0..n_cat-1.
        scale (float, optional): values are drawn uniformly in [0, scale).
            Defaults to 1.

    Returns:
        pd.DataFrame: n_cat**n_dim rows with columns 0..n_dim-1 plus "value".
    """
    # Use ranges directly (the original used set(range(n_cat)), whose
    # iteration order is unspecified) so the row order is deterministic.
    axes = [range(n_cat) for _ in range(n_dim)]
    cartesian_product = list(itertools.product(*axes))
    df = pd.DataFrame(cartesian_product, columns=[*range(n_dim)])
    # generate random values between 0 and scale
    df["value"] = np.random.rand(len(df)) * scale
    return df
26
+
27
def get_unique_col_name(df, base_name):
    """
    Return a column name derived from *base_name* that is absent from *df*.

    Candidates are tried in the order base_name, base_name_1, base_name_2, ...
    and the first one not already used as a column is returned.
    """
    existing = set(df.columns)
    if base_name not in existing:
        return base_name
    suffix = 1
    while f"{base_name}_{suffix}" in existing:
        suffix += 1
    return f"{base_name}_{suffix}"
37
+
38
def agg_by_sql(df: pd.DataFrame,
               by,
               var,
               id):
    """
    Aggregate *var* over *df* with DuckDB, grouping by the *by* columns.

    For each group, *var* is summed with FSUM (DuckDB's compensated /
    Kahan float sum) and the *id* values of the group's rows are collected
    into a list, so each output row records which input rows it aggregates.

    Args:
        df (pd.DataFrame): table to aggregate (DuckDB resolves 'df' to it).
        by (list or None): grouping columns; None or empty aggregates the
            whole table into a single row.
        var (str): name of the numeric column to sum.
        id (str): name of the identifier column to collect per group.
            NOTE(review): shadows the builtin ``id``; kept for interface
            compatibility.

    Returns:
        pd.DataFrame: one row per group with the summed *var* and the list
        of *id* values.
    """
    if by is None or not by:
        # Aggregate over the entire dataset
        query = f"""
        SELECT
        FSUM({var}) AS {var},
        LIST({id}) AS {id}
        FROM 'df'
        """
    else:
        # Aggregate with grouping.
        # Integer column names must be double-quoted to form valid SQL
        # identifiers; string names are used as-is.
        group_by_columns = ", ".join(map(lambda x: '"'+str(x)+'"' if isinstance(x, int) else str(x) , by))
        query = f"""
        SELECT
        {group_by_columns},
        FSUM({var}) AS {var},
        LIST({id}) AS {id}
        FROM 'df'
        GROUP BY {group_by_columns}
        """
    # Execute the query on a fresh in-memory connection.
    with duckdb.connect() as con:
        df_agg = con.execute(query).fetchdf()
    return df_agg
65
+
66
def aggregate_and_list(df:pd.DataFrame,
                       by, var=None,
                       margins=None,
                       id=None):
    """
    Aggregate *df* across every proper subset of the *by* columns.

    Subsets of sizes 0 (grand total) up to len(by)-1 are generated from
    the 'by' columns; each subset is aggregated via agg_by_sql and the
    per-subset results are stacked into one DataFrame.

    Args:
        df (pd.DataFrame): the source data to be aggregated.
        by (str or list): column name(s) to group by and combine.
        var (str, optional): the variable/column name to be aggregated.
        margins (list of lists, optional): when given, only these subsets
            of 'by' are processed.
        id (str, optional): identifier column collected per aggregate row.

    Returns:
        pd.DataFrame: concatenation of all subset aggregations.
    """
    if by is not None and not isinstance(by, list):
        by = [by]

    if by is None:
        subsets = [[]]
    else:
        # All combinations of sizes 0 .. len(by)-1 (the full cross itself
        # is excluded, matching the margin semantics).
        subsets = [list(combo)
                   for size in range(len(by))
                   for combo in combinations(by, size)]

    if margins is not None:
        subsets = [subset for subset in subsets if subset in margins]

    pieces = [agg_by_sql(df, by=subset, var=var, id=id) for subset in subsets]
    return pd.concat([pd.DataFrame(), *pieces], ignore_index=True)
109
+
110
def aggregate_table(df_in,
                    by=None,
                    var=None,
                    margins=None):
    """
    Aggregate the input table into the three tables consumed by ipf().

    Args:
        df_in (pd.DataFrame): input data.
        by (list, optional): columns defining the cells of the table.
        var (str, optional): name of the value column to aggregate.
        margins (list of lists, optional): restrict the generated margins
            to these subsets of *by*.

    Returns:
        tuple of pd.DataFrame:
            cells: one row per cell with the *by* columns, *var* and a
                unique "unit_id" (the decision variables, weight >= 0,
                optionally bounded by lb <= weight <= ub downstream).
            margins: one row per margin with its aggregated value and a
                unique "cons_id" (constraint identifier).
            constraints: mapping of each "cons_id" to the "unit_id"
                values that add up to it.
    """
    # Aggregate "var" by the "by" columns in case the input holds
    # duplicates, so the table has a single entry per cell.
    by_values = df_in.groupby(by)[var].sum().reset_index()

    # Pick column names that cannot collide with existing data columns.
    cell_id_name = get_unique_col_name(by_values, "unit_id")

    # Create a unique identifier for each cell of the table.
    by_values[cell_id_name] = range(len(by_values))

    # Compute the margins of the input table, each carrying the list of
    # cell identifiers it aggregates.
    df_margins = aggregate_and_list(by_values, by, var, margins, cell_id_name)
    cons_id_name = get_unique_col_name(df_margins, "cons_id")
    df_margins[cons_id_name] = range(len(df_margins))

    # Map each margin identifier to the cell identifiers adding up to it
    # (one row per (cons_id, unit_id) pair).
    constraints = df_margins.explode(cell_id_name).reset_index(drop=True)

    return (by_values,
            df_margins.drop([cell_id_name], axis=1),
            constraints[[cell_id_name, cons_id_name]])
151
+
152
+
153
def get_discrepancy(con):
    """
    Compute the discrepancy between the aggregated margins and their targets.

    Reads, from the database connection *con*:
        wrk_weights            current weight per unit_id
        wrk_input_constraints  (unit_id, cons_id) membership pairs
        wrk_input_targets      (cons_id, cons_type, target)

    Writes, in *con*:
        wrk_constraints    aggregated weight per constraint
        wrk_discrepancies  per constraint: target, its current
                           approximation, additive gap ``diff`` and
                           multiplicative factor ``adjustement``

    Returns:
        float: the largest absolute ``diff`` over all constraints.

    NOTE(review): ``adjustement = target / target_approximation`` divides
    by the aggregated weight — a constraint whose units all have weight 0
    yields an infinite/NULL adjustment; verify inputs are positive.
    """
    # Aggregate the current weights per constraint (fsum = DuckDB's
    # compensated float sum).
    con.execute("""
    CREATE OR REPLACE TABLE wrk_constraints AS
    SELECT a.cons_id, fsum(b.weight) as aggregated_weight_per_constraint
    FROM wrk_input_constraints AS a
    LEFT JOIN wrk_weights AS b
    ON a.unit_id=b.unit_id
    GROUP by a.cons_id
    ;
    """)
    # Attach each target to its current approximation.
    con.execute("""
    CREATE OR REPLACE TABLE wrk_discrepancies AS
    SELECT a.cons_id, a.cons_type, a.target, b.aggregated_weight_per_constraint as target_approximation
    FROM wrk_input_targets AS a
    LEFT JOIN wrk_constraints AS b
    ON a.cons_id = b.cons_id
    ;
    """)
    # Derive the gaps, then neutralize inequality constraints that are
    # already satisfied ('le' under target, 'ge' over target): their
    # adjustment becomes 1 and their diff 0 so they don't drive iteration.
    con.execute("""
    CREATE OR REPLACE TABLE wrk_discrepancies AS
    SELECT *,
    -- Step 1: Compute diff and adjustement
    target - target_approximation AS diff,
    target / target_approximation AS adjustement
    FROM wrk_discrepancies;

    -- Step 2: Apply constraints on adjustement and diff
    UPDATE wrk_discrepancies
    SET adjustement = CASE
    WHEN cons_type = 'le' AND adjustement > 1 THEN 1
    WHEN cons_type = 'ge' AND adjustement < 1 THEN 1
    ELSE adjustement
    END,
    diff = CASE
    WHEN cons_type = 'le' AND adjustement > 1 THEN 0
    WHEN cons_type = 'ge' AND adjustement < 1 THEN 0
    ELSE diff
    END;
    ;
    """)
    max_discrepancy = con.execute("SELECT max(abs(diff)) FROM wrk_discrepancies ;").fetchone()[0]
    return max_discrepancy
209
+
210
+
211
def timer(func):
    """
    Decorator that prints the wall-clock duration of each call to *func*.

    The wrapped function's return value is passed through unchanged.
    """
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start = perf_counter()
        result = func(*args, **kwargs)
        elapsed_time = perf_counter() - start
        print(f"Elapsed time: {elapsed_time:0.4f} seconds")
        return result
    return wrapper_timer
224
+
225
@timer
def ipf(input=None,
        constraints=None,
        targets=None,
        unit_id="unit_id",
        var="weight",
        cons_id="cons_id",
        lb=None,
        ub=None,
        cons_type=None,
        db_file=":memory:",
        tol=1,
        max_iter=100,
        out_parquet=None,
        out_csv=None,
        silent=False):
    """
    Adjust table cell values by iterative proportional fitting (RAS).

    Args:
        input (pd.DataFrame or str): cells/units to adjust. Columns:
            *unit_id* (identifier), *var* (value, >= 0), and optionally
            "lb"/"ub" bounds. A str is a file path read by DuckDB
            (CSV, Parquet or JSON).
        constraints (pd.DataFrame or str): maps each constraint identifier
            (*cons_id*) to the unit identifiers (*unit_id*) it aggregates.
        targets (pd.DataFrame or str): one target value per constraint:
            *cons_id*, "target", and optionally a constraint-type column
            (see *cons_type*).
        unit_id (str): name of the unit identifier column (default "unit_id").
        var (str): name of the value column to adjust (default "weight").
        cons_id (str): name of the constraint identifier column
            (default "cons_id").
        lb, ub: truthy to enforce the "lb"/"ub" bound columns of *input*.
        cons_type (str, optional): name of the targets column holding the
            constraint type: 'ge' (>= target), 'le' (<= target) or 'eq'.
            All constraints are treated as 'eq' when omitted.
        db_file (str): DuckDB database file holding the working tables;
            in-memory by default.
        tol (float): stop once the largest absolute margin discrepancy is
            below this value.
        max_iter (int): maximum number of iterations.
        out_parquet (str, optional): write the result to this parquet file.
        out_csv (str, optional): write the result to this csv file.
        silent (bool): suppress progress printing when True.

    Returns:
        pd.DataFrame or None: the adjusted table (unit_id, weight and any
        bound columns) when no output file is requested, otherwise None.
    """
    if not silent:
        print()
        print("-----------")
        print("Calibration")
        print("-----------")
        print()


    with duckdb.connect(db_file) as con:
        # 1. Register each source as a virtual table; strings are file
        # paths DuckDB reads automatically (CSV, Parquet or JSON).
        if isinstance(input, pd.DataFrame):
            con.register('input_table', input)
        elif isinstance(input, str):
            con.execute(f"CREATE OR REPLACE VIEW input_table AS SELECT * FROM '{input}'")

        if isinstance(constraints, pd.DataFrame):
            con.register('constraints_table', constraints)
        elif isinstance(constraints, str):
            con.execute(f"CREATE OR REPLACE VIEW constraints_table AS SELECT * FROM '{constraints}'")

        if isinstance(targets, pd.DataFrame):
            con.register('targets_table', targets)
        elif isinstance(targets, str):
            # BUG FIX: this branch previously tested `constraints`, so a
            # file-path `targets` was never registered.
            con.execute(f"CREATE OR REPLACE VIEW targets_table AS SELECT * FROM '{targets}'")

        # Problem size. BUG FIX: use the configured column names instead
        # of hard-coded "unit_id"/"cons_id".
        n_units = con.execute(f"SELECT COUNT(DISTINCT {unit_id}) FROM input_table;").fetchone()[0]
        n_var = con.execute(f"SELECT COUNT(DISTINCT {cons_id}) FROM constraints_table;").fetchone()[0]

        if not silent:
            print(f"Number of equations: {n_var}")
            print(f"Number of units : {n_units}")
            print()

        # Set up the working table of weights to be adjusted, normalized
        # to the internal column names unit_id/weight.
        sql_select = f"SELECT {unit_id} as unit_id, {var} as weight"
        if lb:
            sql_select += ", lb"
        if ub:
            sql_select += ", ub"

        con.execute(f"""
            CREATE TABLE wrk_weights AS
            {sql_select}
            FROM input_table
        """)

        # Read in the constraints (normalized to internal column names).
        con.execute(f"""
            CREATE TABLE wrk_input_constraints AS
            SELECT {unit_id} AS unit_id, {cons_id} AS cons_id
            FROM constraints_table
        """)

        # Read in the target values; default every constraint to 'eq'
        # unless a constraint-type column was supplied.
        sql_select = f"SELECT {cons_id} AS cons_id, 'eq' as cons_type, target"
        if cons_type:
            sql_select = f"SELECT {cons_id} AS cons_id, {cons_type} AS cons_type, target"
        con.execute(f"""
            CREATE TABLE wrk_input_targets AS
            {sql_select}
            FROM targets_table
        """)

        # Initial state of adjustment between margins and targets.
        max_discrepancy = get_discrepancy(con)
        if not silent:
            print(f"Initial max discrepancy : {max_discrepancy} ")

        n_iter = 0
        while (max_discrepancy >= tol) and (n_iter <= max_iter):
            # For each unit, fetch the adjustment required by each of its
            # constraints.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_constraints as
                SELECT a.*, b.adjustement
                FROM wrk_input_constraints as a
                LEFT JOIN wrk_discrepancies as b
                ON a.cons_id = b.cons_id
                ;
            """)
            # Combine the per-constraint factors into one multiplier per
            # unit via their geometric mean.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_unit_adjustement AS
                SELECT unit_id, exp(mean(log(adjustement))) as adjust
                FROM wrk_constraints
                GROUP BY unit_id
            """)
            # Adjust the weights.
            con.execute("""
                CREATE OR REPLACE TABLE wrk_weights AS
                SELECT a.* EXCLUDE weight, a.weight*b.adjust as weight
                FROM wrk_weights as a
                LEFT JOIN wrk_unit_adjustement as b
                ON a.unit_id = b.unit_id
            """)
            # Clamp the weights back into their bounds. BUG FIX: the
            # original emitted `FROM wrk_weights EXCLUDE weight_;`, which
            # is invalid SQL — EXCLUDE belongs in the star expression.
            if lb:
                con.execute("""
                    CREATE OR REPLACE TABLE wrk_weights AS
                    SELECT * EXCLUDE (weight), GREATEST(weight, lb) AS weight
                    FROM wrk_weights;
                """)
            if ub:
                con.execute("""
                    CREATE OR REPLACE TABLE wrk_weights AS
                    SELECT * EXCLUDE (weight), LEAST(weight, ub) AS weight
                    FROM wrk_weights;
                """)

            max_discrepancy = get_discrepancy(con)

            if not silent:
                print(f"iteration {n_iter} : {max_discrepancy}")
            n_iter += 1

        if out_parquet:
            con.execute(f"COPY wrk_weights TO '{out_parquet}' (FORMAT PARQUET);")
        if out_csv:
            con.execute(f"COPY wrk_weights TO '{out_csv}' (HEADER, DELIMITER ',');")

        if not (out_parquet or out_csv):
            return con.execute("SELECT * FROM wrk_weights").fetchdf()
402
+
@@ -0,0 +1,131 @@
1
+ Metadata-Version: 2.4
2
+ Name: IPFpy
3
+ Version: 0.1.0
4
+ Summary: Performs iterative proportional fitting on tabular data
5
+ Home-page: https://github.com/veozen/IPF
6
+ Author: Christian Gagné
7
+ Author-email: christian.gagne@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: numpy>=1.23.5
14
+ Requires-Dist: pandas>=2.1.2
15
+ Requires-Dist: duckdb>=1.4.0
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: classifier
19
+ Dynamic: description
20
+ Dynamic: description-content-type
21
+ Dynamic: home-page
22
+ Dynamic: requires-dist
23
+ Dynamic: requires-python
24
+ Dynamic: summary
25
+
26
+ # IPFpy
27
+ Iterative proportionial fitting that can work with larger than memory tables.
28
+
29
+ inputs tables can be either pandas dataframes, .csv file or .parquet file
30
+
31
+ ```
32
+ input: table
33
+ This table lists all the cells or units in a table whose value will be adjusted by iterative proportional fitting, along with boundaries within which the adjusted value is meant to stay.
34
+ unit_id : identifier for the decision variables
35
+ weight : decision variables. >=0
36
+ lb : weight >= lb
37
+ ub : weight <= up
38
+
39
+ constraints : table
40
+ This table maps for each constaint identifier, which unit_id to aggregate
41
+ unit_id : identifier for the decision variables
42
+ cons_id : identifier for each marging
43
+
44
+ targets : table
45
+ This table lists all the target values that the margins should add up to once adjusted
46
+ cons_id : identifier for each marging
47
+ cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
48
+ target : value for the constaint
49
+
50
+ unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
51
+ var : name of the column that contains the value to be adjusted (default "weight")
52
+ cons_id : name of the column that identifies each constraints (default "cons_id")
53
+
54
+ db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
55
+
56
+ out_parquet (optional): name path of the parquet output file
57
+ out_csv (optional) : name path of the csv output file
58
+
59
+ silent (optinal default false): Whether or not to print progress to screen
60
+
61
+ output : table
62
+ Output table lists all the initials cells/units along with their adjusted values.
63
+ untiId : identifier for the decision variables
64
+ weight : adjusted weight. Will fit in the interval lb <= weight <= ub
65
+
66
+ ```
67
+
68
+ ## Example
69
+
70
+ ```python
71
+ from IPFpy import *
72
+ import numpy as np
73
+
74
+
75
+ # test IPF
76
+ #step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
77
+ raw_table = generate_random_table(4,8,scale=2)
78
+ input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
79
+ margins = margins.rename(columns={"value":"target"}) #rename margin column
80
+
81
+ # step2 - modify the margins by adding noise to the inner cells
82
+ new_table = input_table.copy().drop("unit_id",axis=1)
83
+ new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
84
+ modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
85
+ modified_margins = modified_margins.rename(columns={"value":"target"})
86
+
87
+ # write table as csv
88
+ input_table.to_csv('input_table.csv', index=False)
89
+ constraints.to_csv('constraints.csv', index=False)
90
+ modified_margins.to_csv('modified_margins.csv', index=False)
91
+
92
+
93
+ # tables can also be written as parquet, e.g. input_table.to_parquet('input_table.parquet', engine='pyarrow')
94
+
95
+ # adjust the table in step1 to the margin obtained in step2
96
+ adjusted_table = ipf( input=input_table,
97
+ constraints=constraints,
98
+ targets=modified_margins,
99
+ unit_id="unit_id",
100
+ var="value",
101
+ cons_id="cons_id",
102
+ db_file=None,
103
+ tol=0.1,
104
+ max_iter=1000)
105
+
106
+ # output to a file
107
+ ipf(input =input_table,
108
+ constraints =constraints,
109
+ targets =modified_margins,
110
+ unit_id ="unit_id",
111
+ var ="value",
112
+ cons_id ="cons_id",
113
+ tol =0.1,
114
+ max_iter =1000,
115
+ out_csv ="adjusted_table.csv",
116
+ silent=True)
117
+
118
+ # input directly from files
119
+ # paths to the input files have to be adjusted to correspond to the location of the input files
120
+ ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
121
+ constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
122
+ targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
123
+ unit_id ="unit_id",
124
+ var ="value",
125
+ cons_id ="cons_id",
126
+ tol =0.1,
127
+ max_iter =1000,
128
+ out_csv ="adjusted_table.csv",
129
+ silent=True)
130
+
131
+ ```
@@ -0,0 +1,10 @@
1
+ README.md
2
+ setup.py
3
+ IPFpy/__init__.py
4
+ IPFpy/ipf.py
5
+ IPFpy.egg-info/PKG-INFO
6
+ IPFpy.egg-info/SOURCES.txt
7
+ IPFpy.egg-info/dependency_links.txt
8
+ IPFpy.egg-info/requires.txt
9
+ IPFpy.egg-info/top_level.txt
10
+ tests/test_ipf.py
@@ -0,0 +1,3 @@
1
+ numpy>=1.23.5
2
+ pandas>=2.1.2
3
+ duckdb>=1.4.0
@@ -0,0 +1 @@
1
+ IPFpy
ipfpy-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,131 @@
1
+ Metadata-Version: 2.4
2
+ Name: IPFpy
3
+ Version: 0.1.0
4
+ Summary: Performs iterative proportional fitting on tabular data
5
+ Home-page: https://github.com/veozen/IPF
6
+ Author: Christian Gagné
7
+ Author-email: christian.gagne@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: numpy>=1.23.5
14
+ Requires-Dist: pandas>=2.1.2
15
+ Requires-Dist: duckdb>=1.4.0
16
+ Dynamic: author
17
+ Dynamic: author-email
18
+ Dynamic: classifier
19
+ Dynamic: description
20
+ Dynamic: description-content-type
21
+ Dynamic: home-page
22
+ Dynamic: requires-dist
23
+ Dynamic: requires-python
24
+ Dynamic: summary
25
+
26
+ # IPFpy
27
+ Iterative proportionial fitting that can work with larger than memory tables.
28
+
29
+ inputs tables can be either pandas dataframes, .csv file or .parquet file
30
+
31
+ ```
32
+ input: table
33
+ This table lists all the cells or units in a table whose value will be adjusted by iterative proportional fitting, along with boundaries within which the adjusted value is meant to stay.
34
+ unit_id : identifier for the decision variables
35
+ weight : decision variables. >=0
36
+ lb : weight >= lb
37
+ ub : weight <= up
38
+
39
+ constraints : table
40
+ This table maps for each constaint identifier, which unit_id to aggregate
41
+ unit_id : identifier for the decision variables
42
+ cons_id : identifier for each marging
43
+
44
+ targets : table
45
+ This table lists all the target values that the margins should add up to once adjusted
46
+ cons_id : identifier for each marging
47
+ cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
48
+ target : value for the constaint
49
+
50
+ unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
51
+ var : name of the column that contains the value to be adjusted (default "weight")
52
+ cons_id : name of the column that identifies each constraints (default "cons_id")
53
+
54
+ db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
55
+
56
+ out_parquet (optional): name path of the parquet output file
57
+ out_csv (optional) : name path of the csv output file
58
+
59
+ silent (optinal default false): Whether or not to print progress to screen
60
+
61
+ output : table
62
+ Output table lists all the initials cells/units along with their adjusted values.
63
+ untiId : identifier for the decision variables
64
+ weight : adjusted weight. Will fit in the interval lb <= weight <= ub
65
+
66
+ ```
67
+
68
+ ## Example
69
+
70
+ ```python
71
+ from IPFpy import *
72
+ import numpy as np
73
+
74
+
75
+ # test IPF
76
+ #step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
77
+ raw_table = generate_random_table(4,8,scale=2)
78
+ input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
79
+ margins = margins.rename(columns={"value":"target"}) #rename margin column
80
+
81
+ # step2 - modify the margins by adding noise to the inner cells
82
+ new_table = input_table.copy().drop("unit_id",axis=1)
83
+ new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
84
+ modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
85
+ modified_margins = modified_margins.rename(columns={"value":"target"})
86
+
87
+ # write table as csv
88
+ input_table.to_csv('input_table.csv', index=False)
89
+ constraints.to_csv('constraints.csv', index=False)
90
+ modified_margins.to_csv('modified_margins.csv', index=False)
91
+
92
+
93
+ df.to_parquet('my_data.parquet', engine='pyarrow')
94
+
95
+ # adjust the table in step1 to the margin obtained in step2
96
+ adjusted_table = ipf( input=input_table,
97
+ constraints=constraints,
98
+ targets=modified_margins,
99
+ unit_id="unit_id",
100
+ var="value",
101
+ cons_id="cons_id",
102
+ db_file=None,
103
+ tol=0.1,
104
+ max_iter=1000)
105
+
106
+ # output to a file
107
+ ipf(input =input_table,
108
+ constraints =constraints,
109
+ targets =modified_margins,
110
+ unit_id ="unit_id",
111
+ var ="value",
112
+ cons_id ="cons_id",
113
+ tol =0.1,
114
+ max_iter =1000,
115
+ out_csv ="adjusted_table.csv",
116
+ silent=True)
117
+
118
+ # input directly from files
119
+ # paths to the input files have to be adjusted to correspond to the location of the input files
120
+ ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
121
+ constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
122
+ targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
123
+ unit_id ="unit_id",
124
+ var ="value",
125
+ cons_id ="cons_id",
126
+ tol =0.1,
127
+ max_iter =1000,
128
+ out_csv ="adjusted_table.csv",
129
+ silent=True)
130
+
131
+ ```
ipfpy-0.1.0/README.md ADDED
@@ -0,0 +1,106 @@
1
+ # IPFpy
2
+ Iterative proportionial fitting that can work with larger than memory tables.
3
+
4
+ inputs tables can be either pandas dataframes, .csv file or .parquet file
5
+
6
+ ```
7
+ input: table
8
+ This table lists all the cells or units in a table whose value will be adjusted by iterative proportional fitting, along with boundaries within which the adjusted value is meant to stay.
9
+ unit_id : identifier for the decision variables
10
+ weight : decision variables. >=0
11
+ lb : weight >= lb
12
+ ub : weight <= up
13
+
14
+ constraints : table
15
+ This table maps for each constaint identifier, which unit_id to aggregate
16
+ unit_id : identifier for the decision variables
17
+ cons_id : identifier for each marging
18
+
19
+ targets : table
20
+ This table lists all the target values that the margins should add up to once adjusted
21
+ cons_id : identifier for each marging
22
+ cons_type : constraint must be greater or equal (ge) the target, lesser or equal (le), or equal (eq)
23
+ target : value for the constaint
24
+
25
+ unit_id : name of the column that identifies each value to be adjusted (default "unit_id")
26
+ var : name of the column that contains the value to be adjusted (default "weight")
27
+ cons_id : name of the column that identifies each constraints (default "cons_id")
28
+
29
+ db_file (optional ): name of the database file on disc that will hold the temporary tables. Default is in memory
30
+
31
+ out_parquet (optional): name path of the parquet output file
32
+ out_csv (optional) : name path of the csv output file
33
+
34
+ silent (optinal default false): Whether or not to print progress to screen
35
+
36
+ output : table
37
+ Output table lists all the initials cells/units along with their adjusted values.
38
+ untiId : identifier for the decision variables
39
+ weight : adjusted weight. Will fit in the interval lb <= weight <= ub
40
+
41
+ ```
42
+
43
+ ## Example
44
+
45
+ ```python
46
+ from IPFpy import *
47
+ import numpy as np
48
+
49
+
50
+ # test IPF
51
+ #step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
52
+ raw_table = generate_random_table(4,8,scale=2)
53
+ input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
54
+ margins = margins.rename(columns={"value":"target"}) #rename margin column
55
+
56
+ # step2 - modify the margins by adding noise to the inner cells
57
+ new_table = input_table.copy().drop("unit_id",axis=1)
58
+ new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
59
+ modified_table, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
60
+ modified_margins = modified_margins.rename(columns={"value":"target"})
61
+
62
+ # write table as csv
63
+ input_table.to_csv('input_table.csv', index=False)
64
+ constraints.to_csv('constraints.csv', index=False)
65
+ modified_margins.to_csv('modified_margins.csv', index=False)
66
+
67
+
68
+ df.to_parquet('my_data.parquet', engine='pyarrow')
69
+
70
+ # adjust the table in step1 to the margin obtained in step2
71
+ adjusted_table = ipf( input=input_table,
72
+ constraints=constraints,
73
+ targets=modified_margins,
74
+ unit_id="unit_id",
75
+ var="value",
76
+ cons_id="cons_id",
77
+ db_file=None,
78
+ tol=0.1,
79
+ max_iter=1000)
80
+
81
+ # output to a file
82
+ ipf(input =input_table,
83
+ constraints =constraints,
84
+ targets =modified_margins,
85
+ unit_id ="unit_id",
86
+ var ="value",
87
+ cons_id ="cons_id",
88
+ tol =0.1,
89
+ max_iter =1000,
90
+ out_csv ="adjusted_table.csv",
91
+ silent=True)
92
+
93
+ # input directly from files
94
+ # paths to the input files have to be adjusted to correspond to the location of the input files
95
+ ipf(input ="/home/Desktop/Programming/IPF/IPF/input_table.csv",
96
+ constraints ="/home/Desktop/Programming/IPF/IPF/constraints.csv",
97
+ targets ="/home/Desktop/Programming/IPF/IPF/modified_margins.csv",
98
+ unit_id ="unit_id",
99
+ var ="value",
100
+ cons_id ="cons_id",
101
+ tol =0.1,
102
+ max_iter =1000,
103
+ out_csv ="adjusted_table.csv",
104
+ silent=True)
105
+
106
+ ```
ipfpy-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
ipfpy-0.1.0/setup.py ADDED
@@ -0,0 +1,31 @@
1
"""
Setup configuration for the IPFpy package.

This script handles the packaging, dependency management, and metadata
required to distribute IPFpy via PyPI.
"""

from pathlib import Path

from setuptools import setup, find_packages

# Read the long description with an explicit encoding and without leaking
# a file handle (the original used a bare open() with no close and the
# platform-default encoding).
_LONG_DESCRIPTION = Path(__file__).with_name('README.md').read_text(encoding='utf-8')

setup(
    name='IPFpy',
    version='0.1.0',
    packages=find_packages(),
    install_requires=[
        'numpy>=1.23.5',
        'pandas>=2.1.2',
        'duckdb>=1.4.0',
    ],
    author='Christian Gagné',
    author_email='christian.gagne@gmail.com',
    description='Performs iterative proportional fitting on tabular data',
    long_description=_LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    url='https://github.com/veozen/IPF',
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.10',
)
@@ -0,0 +1,38 @@
1
+ """
2
+ Tests for IPFpy package
3
+ """
4
+
5
+ import pytest
6
+ import pandas as pd
7
+ import numpy as np
8
+ from IPFpy import ipf, generate_random_table, aggregate_table
9
+
10
+ def test_ipf_basic_execution():
11
+ """
12
+ Tests that ipf produces a result for a random 4 dimensional table where each dimension has 8 categories
13
+ """
14
+ #step1 - create a table and generate the margins as well as the file that maps the cells of the inner table to the margins
15
+ raw_table = generate_random_table(4,8,scale=2)
16
+ input_table, margins, constraints = aggregate_table(raw_table, by=[0,1,2,3], var="value")
17
+ margins = margins.rename(columns={"value":"target"}) #rename margin column
18
+
19
+ # step2 - modify the margins by adding noise to the inner cells
20
+ new_table = input_table.copy().drop("unit_id",axis=1)
21
+ new_table["value"] = input_table["value"] * np.random.uniform(0, 2, input_table.shape[0])
22
+ _, modified_margins, constraints = aggregate_table(new_table, by=[0,1,2,3], var="value")
23
+ modified_margins = modified_margins.rename(columns={"value":"target"})
24
+
25
+ # 2. Execution
26
+ # adjust the table in step1 to the margin obtained in step2
27
+ adjusted_table = ipf( input=input_table,
28
+ constraints=constraints,
29
+ targets=modified_margins,
30
+ unit_id="unit_id",
31
+ var="value",
32
+ cons_id="cons_id",
33
+ tol=0.1,
34
+ max_iter=1000)
35
+ # 3. Assertions
36
+ assert isinstance(adjusted_table, pd.DataFrame), f"Expected Pandas Dataframe, got {type(adjusted_table)}"
37
+ assert len(adjusted_table) > 0, "The returned table is empty"
38
+