pyretailscience 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
File without changes
@@ -0,0 +1,46 @@
1
+ import sys
2
+
3
+ import click
4
+ from loguru import logger
5
+
6
+ from pyretailscience.data import simulation
7
+
8
+
9
@click.command()
@click.option(
    "--config_file",
    type=click.Path(exists=True, dir_okay=False),
    required=True,
    help="YAML configuration file for the simulation.",
)
@click.option(
    "--verbose",
    type=bool,
    default=False,
    help="Pass a boolean value (e.g. 'true') to log debug messages.",
)
@click.option("--seed", default=1234, type=int, help="Seed for the random number generator.")
@click.argument("output_file", type=click.Path(dir_okay=False))
def generate(
    config_file: str,
    verbose: bool,
    seed: int,
    output_file: str,
) -> None:
    """Generate a parquet file with random transaction data.

    Args:
        config_file (str): Configuration file for the simulation
        verbose (bool): Whether to print debug messages
        seed (int): Random seed passed to the simulation
        output_file (str): File to write the transactions to in parquet format
    """
    # Replace loguru's default sink with a stderr sink at the requested level.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG" if verbose else "INFO")

    logger.info("Generating data...")

    sim = simulation.Simulation(seed=seed, config_file=config_file)
    sim.run()
    sim.save_transactions(output_file)

    logger.info("Done!")


if __name__ == "__main__":
    generate()
@@ -0,0 +1,300 @@
1
+ import uuid
2
+ from dataclasses import dataclass
3
+ from datetime import date, datetime, time, timedelta
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ import strictyaml as yaml
8
+ from loguru import logger
9
+ from tqdm import tqdm
10
+
11
# Configuration schema for the config file yaml.
# The product hierarchy is assembled bottom-up (product -> brand -> subcategory
# -> category) so that each nesting level has a name.
_product_schema = yaml.Map(
    {
        "product_name": yaml.Str(),
        "product_id": yaml.Int(),
        "unit_price": yaml.Float(),
    }
)

_brand_schema = yaml.Map(
    {
        "brand": yaml.Str(),
        "brand_id": yaml.Int(),
        "products": yaml.Seq(_product_schema),
    }
)

_subcategory_schema = yaml.Map(
    {
        "category_1": yaml.Str(),
        "category_1_id": yaml.Int(),
        "brands": yaml.Seq(_brand_schema),
    }
)

_category_schema = yaml.Map(
    {
        "category_0": yaml.Str(),
        "category_0_id": yaml.Int(),
        "subcategories": yaml.Seq(_subcategory_schema),
    }
)

_stores_schema = yaml.Map(
    {
        "number_of_stores": yaml.Int(),
    }
)

_transactions_schema = yaml.Map(
    {
        "start_date": yaml.Datetime(),
        "end_date": yaml.Datetime(),
        "start_hour": yaml.Int(),
        "end_hour": yaml.Int(),
        "max_products_per_transaction": yaml.Int(),
    }
)

_customers_schema = yaml.Map(
    {
        "starting_number_of_customers": yaml.Int(),
        "churn_probability": yaml.Float(),
        "average_time_between_purchases": yaml.Int(),
        "average_new_customers_per_day": yaml.Int(),
    }
)

schema = yaml.Map(
    {
        "stores": _stores_schema,
        "transactions": _transactions_schema,
        "customers": _customers_schema,
        "products": yaml.Seq(_category_schema),
    }
)
71
+
72
+
73
+ def _random_time(rnd_generator: np.random.Generator, start_hour, end_hour):
74
+ hour = rnd_generator.integers(start_hour, end_hour)
75
+ minute = rnd_generator.integers(0, 60)
76
+ second = rnd_generator.integers(0, 60)
77
+
78
+ return time(hour, minute, second)
79
+
80
+
81
@dataclass
class Product:
    """A product together with its category, subcategory and brand labels."""

    # Top-level category
    category_0: str
    category_0_id: int

    # Subcategory within the top-level category
    category_1: str
    category_1_id: int

    # Brand
    brand: str
    brand_id: int

    # The product itself
    product_name: str
    product_id: int
    unit_price: float

    # Mean of the Poisson distribution that purchase quantities are drawn from
    quantity_mean: int
93
+
94
+
95
class TransactionGenerator:
    """Generates a random transaction for a customer."""

    def __init__(
        self,
        rnd_generator: np.random.Generator,
        num_stores: int,
        max_products_per_transaction: int,
        products: list[Product],
        start_hour: int,
        end_hour: int,
    ):
        """Store the parameters used for every generated transaction.

        Args:
            rnd_generator: Random generator all draws are taken from.
            num_stores: Number of stores; store ids are drawn from 1..num_stores.
            max_products_per_transaction: Largest number of lines in one transaction.
            products: Products that can appear on a transaction line.
            start_hour: Lowest transaction hour (inclusive).
            end_hour: Upper bound for the transaction hour (exclusive).
        """
        self.num_stores = num_stores
        self.rnd_generator = rnd_generator
        self.products = products
        self.max_products_per_transaction = max_products_per_transaction
        self.start_hour = start_hour
        self.end_hour = end_hour

    def generate_transaction(self, customer_id: int, simulation_date: date) -> list[dict]:
        """Generate the lines of one transaction for a customer.

        Args:
            customer_id: Id recorded on every line.
            simulation_date: Day of the transaction; a random time is added to it.

        Returns:
            One dict per transaction line. All lines share the same
            transaction id, timestamp, customer id and store id.
        """
        # Combine transaction_date with a random time
        simulation_datetime = datetime.combine(
            simulation_date,
            _random_time(
                rnd_generator=self.rnd_generator,
                start_hour=self.start_hour,
                end_hour=self.end_hour,
            ),
        )

        # Generator.integers excludes the upper bound by default. endpoint=True
        # makes the last store and the maximum basket size reachable, and keeps
        # a configuration with a single store or one product per transaction valid.
        store_id = self.rnd_generator.integers(1, self.num_stores, endpoint=True)
        num_lines = self.rnd_generator.integers(1, self.max_products_per_transaction, endpoint=True)

        # Sampling is with replacement, so a product can appear on several lines.
        products = self.rnd_generator.choice(self.products, size=num_lines)

        # Generate a UUID for now, but we'll change it to a sequential integer later
        transaction_id = str(uuid.uuid4())

        transaction_lines = []
        for product in products:
            unit_price = float(product.unit_price)
            # NOTE: a Poisson draw can be 0, so a line may have zero quantity.
            quantity = self.rnd_generator.poisson(product.quantity_mean)

            transaction_lines.append(
                {
                    "transaction_id": transaction_id,
                    "date": simulation_datetime,
                    "customer_id": customer_id,
                    "product_id": product.product_id,
                    "product_name": product.product_name,
                    "category_0": product.category_0,
                    "category_0_id": product.category_0_id,
                    "category_1": product.category_1,
                    "category_1_id": product.category_1_id,
                    "brand": product.brand,
                    "brand_id": product.brand_id,
                    "unit_price": unit_price,
                    "quantity": quantity,
                    "total_price": unit_price * quantity,
                    "store_id": store_id,
                }
            )

        return transaction_lines
160
+
161
+
162
class Customer:
    """A simulated customer who buys periodically until they churn."""

    # Class-level default; step() sets it on the instance when the customer churns.
    has_churned = False

    def __init__(
        self,
        rnd_generator: np.random.Generator,
        churn_prob: float,
        customer_id: int,
        transaction_gen: TransactionGenerator,
        period_between_purchases: int,
    ):
        """Set up the customer and draw the wait until their first purchase.

        Args:
            rnd_generator: Random generator all draws are taken from.
            churn_prob: Probability of churning after each purchase.
            customer_id: Identifier stored as ``self.id``.
            transaction_gen: Generator used to create the purchase lines.
            period_between_purchases: Poisson mean of the number of steps
                between purchases.
        """
        self.id = customer_id
        self.churn_prob = churn_prob
        self.rnd_generator = rnd_generator
        self.transaction_gen = transaction_gen
        self.periods_between_purchases = period_between_purchases
        self.transactions = []

        self.time_to_next_purchase = self.rnd_generator.poisson(period_between_purchases)

    def step(self, date: date = None) -> None:
        """Advance the customer by one step.

        A churned customer does nothing. Otherwise the countdown to the next
        purchase is decremented; when it is already zero the customer buys, and
        a Bernoulli trial then decides whether they churn or a new countdown
        is drawn.

        Args:
            date: Date passed to the transaction generator for a purchase.
        """
        if self.has_churned:
            return

        if self.time_to_next_purchase != 0:
            # Not due yet: just count down.
            self.time_to_next_purchase -= 1
            return

        # time to buy!
        logger.debug("Customer made a purchase")
        self.transactions.extend(self.transaction_gen.generate_transaction(self.id, date))

        # Bernoulli trial to see if customer churns
        churned = self.rnd_generator.binomial(1, self.churn_prob)
        if not churned:
            self.time_to_next_purchase = self.rnd_generator.poisson(self.periods_between_purchases)
            return

        self.has_churned = True
        logger.debug(f"Customer {self.id} churned")
199
+
200
+
201
class Simulation:
    """Day-by-day simulation of customers generating retail transactions.

    The configuration is read from a YAML file validated against ``schema``.
    Customers are created up front and added daily; ``run`` steps every
    customer once per simulated day and collects their transaction lines.
    """

    def __init__(
        self,
        seed: int,
        config_file: str,
    ):
        """Load the configuration and create the starting customers.

        Args:
            seed: Seed for the numpy random generator.
            config_file: Path of the YAML configuration file.

        Raises:
            yaml.YAMLError: If the file does not validate against ``schema``.
        """
        self.seed = seed
        with open(config_file, "r", encoding="utf-8") as f:
            try:
                self.config = yaml.load(f.read(), schema).data
            except yaml.YAMLError as error:
                logger.error(error)
                raise

        self.rnd_generator = np.random.default_rng(self.seed)

        self.products = self._load_products()

        self.customers = [
            self._create_customer(customer_id=customer_id)
            for customer_id in range(1, self.config["customers"]["starting_number_of_customers"] + 1)
        ]
        self.transactions = []

    def step(self, date: date) -> None:
        """Add the day's new customers, then step every customer.

        Args:
            date: The simulated day.
        """
        num_new_customers = self.rnd_generator.poisson(self.config["customers"]["average_new_customers_per_day"])
        logger.debug(f"Adding {num_new_customers} new customers")

        # New ids continue after the existing ones. The range must span
        # num_new_customers ids starting from the next free id; ending it at
        # num_new_customers + 1 would leave it empty as soon as the population
        # is at least as large as the number of arrivals.
        first_new_id = len(self.customers) + 1
        self.customers.extend(
            [
                self._create_customer(customer_id=new_customer_id)
                for new_customer_id in range(first_new_id, first_new_id + num_new_customers)
            ]
        )

        # Simulate each customer
        for customer in self.customers:
            customer.step(date)

    def run(self) -> None:
        """Simulate every day from start_date up to, but not including, end_date.

        Afterwards the transaction lines of all customers are gathered into
        ``self.transactions`` and their UUID transaction ids are replaced by
        sequential integers starting at 0.
        """
        start_date = self.config["transactions"]["start_date"].date()
        end_date = self.config["transactions"]["end_date"].date()
        days_in_simulation = [start_date + timedelta(n) for n in range((end_date - start_date).days)]
        for sim_day in tqdm(days_in_simulation, desc="Simulating days"):
            self.step(sim_day)

        transactions = []
        for customer in self.customers:
            transactions.extend(customer.transactions)

        # Change transactions UUIDs to sequential integers.
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # numbering follows the order of the collected lines. Iterating a set
        # of strings would make it depend on hash randomisation.
        unique_transaction_ids = dict.fromkeys(t["transaction_id"] for t in transactions)
        transaction_id_map = {transaction_id: i for i, transaction_id in enumerate(unique_transaction_ids)}
        for transaction in transactions:
            transaction["transaction_id"] = transaction_id_map[transaction["transaction_id"]]

        self.transactions = transactions

    def _create_customer(self, customer_id: int) -> Customer:
        """Create a customer, with its own transaction generator, from the config.

        Args:
            customer_id: Id given to the new customer.

        Returns:
            The new customer, sharing this simulation's random generator and
            product list.
        """
        return Customer(
            rnd_generator=self.rnd_generator,
            churn_prob=self.config["customers"]["churn_probability"],
            customer_id=customer_id,
            period_between_purchases=self.config["customers"]["average_time_between_purchases"],
            transaction_gen=TransactionGenerator(
                rnd_generator=self.rnd_generator,
                num_stores=self.config["stores"]["number_of_stores"],
                max_products_per_transaction=self.config["transactions"]["max_products_per_transaction"],
                products=self.products,
                start_hour=self.config["transactions"]["start_hour"],
                end_hour=self.config["transactions"]["end_hour"],
            ),
        )

    def _load_products(self) -> list[Product]:
        """Flatten the nested product configuration into a list of products.

        Returns:
            One ``Product`` per configured product, carrying the labels of its
            category, subcategory and brand.
        """
        products = []

        for category_0 in self.config["products"]:
            for category_1 in category_0["subcategories"]:
                for brand in category_1["brands"]:
                    for product in brand["products"]:
                        products.append(
                            Product(
                                category_0=category_0["category_0"],
                                category_0_id=int(category_0["category_0_id"]),
                                category_1=category_1["category_1"],
                                category_1_id=int(category_1["category_1_id"]),
                                brand=brand["brand"],
                                brand_id=int(brand["brand_id"]),
                                product_name=product["product_name"],
                                product_id=int(product["product_id"]),
                                unit_price=product["unit_price"],
                                # TODO: Move this to the config file
                                # NOTE: the Poisson draw can be 0, which gives
                                # a product whose quantity is always 0.
                                quantity_mean=self.rnd_generator.poisson(self.rnd_generator.integers(1, 3)),
                            )
                        )
        return products

    def save_transactions(self, output_file: str) -> None:
        """Write the collected transaction lines to a parquet file.

        Args:
            output_file: Path of the parquet file to write.
        """
        df = pd.DataFrame(self.transactions)
        logger.info(f"Saving {len(df)} transactions to {output_file}")
        df.to_parquet(output_file, index=False)
@@ -0,0 +1,93 @@
1
+ Elastic License 2.0
2
+
3
+ URL: https://www.elastic.co/licensing/elastic-license
4
+
5
+ ## Acceptance
6
+
7
+ By using the software, you agree to all of the terms and conditions below.
8
+
9
+ ## Copyright License
10
+
11
+ The licensor grants you a non-exclusive, royalty-free, worldwide,
12
+ non-sublicensable, non-transferable license to use, copy, distribute, make
13
+ available, and prepare derivative works of the software, in each case subject to
14
+ the limitations and conditions below.
15
+
16
+ ## Limitations
17
+
18
+ You may not provide the software to third parties as a hosted or managed
19
+ service, where the service provides users with access to any substantial set of
20
+ the features or functionality of the software.
21
+
22
+ You may not move, change, disable, or circumvent the license key functionality
23
+ in the software, and you may not remove or obscure any functionality in the
24
+ software that is protected by the license key.
25
+
26
+ You may not alter, remove, or obscure any licensing, copyright, or other notices
27
+ of the licensor in the software. Any use of the licensor’s trademarks is subject
28
+ to applicable law.
29
+
30
+ ## Patents
31
+
32
+ The licensor grants you a license, under any patent claims the licensor can
33
+ license, or becomes able to license, to make, have made, use, sell, offer for
34
+ sale, import and have imported the software, in each case subject to the
35
+ limitations and conditions in this license. This license does not cover any
36
+ patent claims that you cause to be infringed by modifications or additions to
37
+ the software. If you or your company make any written claim that the software
38
+ infringes or contributes to infringement of any patent, your patent license for
39
+ the software granted under these terms ends immediately. If your company makes
40
+ such a claim, your patent license ends immediately for work on behalf of your
41
+ company.
42
+
43
+ ## Notices
44
+
45
+ You must ensure that anyone who gets a copy of any part of the software from you
46
+ also gets a copy of these terms.
47
+
48
+ If you modify the software, you must include in any modified copies of the
49
+ software prominent notices stating that you have modified the software.
50
+
51
+ ## No Other Rights
52
+
53
+ These terms do not imply any licenses other than those expressly granted in
54
+ these terms.
55
+
56
+ ## Termination
57
+
58
+ If you use the software in violation of these terms, such use is not licensed,
59
+ and your licenses will automatically terminate. If the licensor provides you
60
+ with a notice of your violation, and you cease all violation of this license no
61
+ later than 30 days after you receive that notice, your licenses will be
62
+ reinstated retroactively. However, if you violate these terms after such
63
+ reinstatement, any additional violation of these terms will cause your licenses
64
+ to terminate automatically and permanently.
65
+
66
+ ## No Liability
67
+
68
+ *As far as the law allows, the software comes as is, without any warranty or
69
+ condition, and the licensor will not be liable to you for any damages arising
70
+ out of these terms or the use or nature of the software, under any kind of
71
+ legal claim.*
72
+
73
+ ## Definitions
74
+
75
+ The **licensor** is the entity offering these terms, and the **software** is the
76
+ software the licensor makes available under these terms, including any portion
77
+ of it.
78
+
79
+ **you** refers to the individual or entity agreeing to these terms.
80
+
81
+ **your company** is any legal entity, sole proprietorship, or other kind of
82
+ organization that you work for, plus all organizations that have control over,
83
+ are under the control of, or are under common control with that
84
+ organization. **control** means ownership of substantially all the assets of an
85
+ entity, or the power to direct its management and policies by vote, contract, or
86
+ otherwise. Control can be direct or indirect.
87
+
88
+ **your licenses** are all the licenses granted to you for the software under
89
+ these terms.
90
+
91
+ **use** means anything you do with the software requiring one of your licenses.
92
+
93
+ **trademark** means trademarks, service marks, and similar rights.
@@ -0,0 +1,29 @@
1
+ Metadata-Version: 2.1
2
+ Name: pyretailscience
3
+ Version: 0.1.0
4
+ Summary: Retail Data Science Tools
5
+ License: Elastic-2.0
6
+ Author: Murray Vanwyk
7
+ Author-email: 2493311+mvanwyk@users.noreply.github.com
8
+ Requires-Python: >=3.10,<3.12
9
+ Classifier: License :: Other/Proprietary License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Requires-Dist: click (>=8.1.7,<9.0.0)
14
+ Requires-Dist: loguru (>=0.7.2,<0.8.0)
15
+ Requires-Dist: matplotlib (>=3.8.2,<4.0.0)
16
+ Requires-Dist: numpy (>=1.26.3,<2.0.0)
17
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
18
+ Requires-Dist: pyarrow (>=14.0.2,<15.0.0)
19
+ Requires-Dist: seaborn (>=0.13.1,<0.14.0)
20
+ Requires-Dist: strictyaml (>=1.7.3,<2.0.0)
21
+ Description-Content-Type: text/markdown
22
+
23
+ ![pyretailscience logo](logo.png)
24
+
25
+ # pyretailscience
26
+
27
+ ⚡ Democratizing retail data analytics for all retailers ⚡
28
+
29
+ pyretailscience is a Python package designed for performing analytics on retail data. Additionally, the package includes functionality for generating test data to facilitate testing and development.
@@ -0,0 +1,9 @@
1
+ pyretailscience/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ pyretailscience/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ pyretailscience/data/cli.py,sha256=GxLUcRPbRbdDfYnH4JwQjqv2oHDlo9ub3Tzty61BsZA,1148
4
+ pyretailscience/data/simulation.py,sha256=F6hUvQN8Eg3nq_EPUsu1j9pzOmMLGlV6JwmfmL2FC1Q,11265
5
+ pyretailscience-0.1.0.dist-info/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
6
+ pyretailscience-0.1.0.dist-info/METADATA,sha256=VbYi6PnF_FsFtn00NQBqESyEFEgZTKBsgAePPARgRRI,1103
7
+ pyretailscience-0.1.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
8
+ pyretailscience-0.1.0.dist-info/entry_points.txt,sha256=VJFsbNuCLiNNiOxZ_JYZ_4ZerEQ5rRTpc5jfvkh8Fys,69
9
+ pyretailscience-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.8.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ pyretailscience=pyretailscience.data.cli:generate
3
+