datasourcelib 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -300,7 +300,6 @@ class AzureDevOpsSource(DataSourceBase):
300
300
 
301
301
  # Display name and url
302
302
  display_name = page.get("name") or page.get("pageName") or page_path.strip("/") or "/"
303
- new_display_name = self.sanitize(display_name.replace(" ", "_").strip()),
304
303
  url = (
305
304
  page.get("remoteUrl")
306
305
  or page.get("url")
@@ -343,7 +342,7 @@ class AzureDevOpsSource(DataSourceBase):
343
342
  # Construct a 'full' description string using available pieces
344
343
  content_text = BeautifulSoup(content_text or "", "html.parser").get_text(),
345
344
  parts = []
346
- if new_display_name:
345
+ if display_name:
347
346
  parts.append(f"Wiki Page Name is {display_name}. Page has information about {display_name}")
348
347
  if project_name:
349
348
  parts.append(f"This page is documented by for Project '{project_name}' and by the team '{project_name}'")
@@ -354,7 +353,8 @@ class AzureDevOpsSource(DataSourceBase):
354
353
 
355
354
  index_content = ". ".join(parts)
356
355
  results.append({
357
- "display_name": new_display_name,
356
+ "display_name": self.sanitize(display_name.replace(" ", "_").strip()),
357
+ "page_name": display_name,
358
358
  "url": url,
359
359
  "content": index_content,
360
360
  "project": project_name
@@ -2,6 +2,7 @@
2
2
  import pandas as pd
3
3
  from string import Formatter
4
4
  from typing import Iterable, Any, Dict, List, Optional, Union
5
+ import regex as re
5
6
 
6
7
  def _placeholders(fmt: str) -> List[str]:
7
8
  """
@@ -12,6 +13,12 @@ def _placeholders(fmt: str) -> List[str]:
12
13
  def _safe_str(x) -> str:
13
14
  return "" if pd.isna(x) else str(x).strip()
14
15
 
16
+ @staticmethod
17
+ def sanitize(s: str) -> str:
18
+ """Keep only A-Z a-z 0-9 underscore/dash/equals in a safe way."""
19
+ # using the `regex` import already present as `re`
20
+ return re.sub(r'[^A-Za-z0-9_\-=]', '', s)
21
+
15
22
  def generate_grouped_summaries(
16
23
  df: pd.DataFrame,
17
24
  aggregation_field: str,
@@ -146,7 +153,7 @@ def generate_grouped_summaries(
146
153
 
147
154
  content = header + " " + " ".join(lines)
148
155
  summaries.append(
149
- {"content" : content, "id": group_value}
156
+ {"content" : content, "id": sanitize(group_value.replace(" ", "_").strip())}
150
157
  )
151
158
 
152
159
  return summaries
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasourcelib
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Data source sync strategies for vector DBs
5
5
  Home-page: https://github.com/akashmaurya0217/datasourcelib
6
6
  Author: Akash Kumar Maurya
@@ -5,7 +5,7 @@ datasourcelib/core/sync_manager.py,sha256=pfnvWv4AwmlJJUIsfxNNxYDBOsa7juTIxgFJIE
5
5
  datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
6
6
  datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
7
7
  datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
8
- datasourcelib/datasources/azure_devops_source.py,sha256=o-rl090HxbBA_Sl6WHazIDoA1NhjybIrmyQCU0SwzqA,19649
8
+ datasourcelib/datasources/azure_devops_source.py,sha256=jWey-FbL5DnzVFFhFaw5QdVUv0t81ATLM3ASKMozGhc,19639
9
9
  datasourcelib/datasources/azure_devops_source10dec.py,sha256=J48E78AEfqkS-eBq7sesA48zmSiZ9oSfJkQjL7RAbyA,7928
10
10
  datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
11
11
  datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
@@ -24,14 +24,14 @@ datasourcelib/strategies/incremental_load.py,sha256=CY1tAyXwjZLoq5zMLwB5i5qmT_L8
24
24
  datasourcelib/strategies/ondemand_load.py,sha256=MgenKJbJePLeErdEkXKsz1h7RuR8yT0RV_X523G7UUs,1304
25
25
  datasourcelib/strategies/timerange_load.py,sha256=W_sSZg059Lw2o9tmdGKM9D5-z1pph7AN1ftalXhuyjo,1557
26
26
  datasourcelib/utils/__init__.py,sha256=9pSIpaK-kdmNuDzwl0Z7QU-_lV3cZE-iwOEPh3RBBTs,298
27
- datasourcelib/utils/aggregation.py,sha256=5aOBcxay4eTyY-S4BRafNgSi37AY-JXERzcCv055E8w,6060
27
+ datasourcelib/utils/aggregation.py,sha256=_XzTxdGIc-nc0w1FE1NfPA6J1PmAKiSpz0sYU7yEU6s,6337
28
28
  datasourcelib/utils/byte_reader.py,sha256=GaoPXwJa2YTWG1Kim0K6JG20eVSaWkZJd1o9bswxHmc,9082
29
29
  datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4fYQww,369
30
30
  datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
31
31
  datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
32
32
  datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
33
- datasourcelib-0.1.12.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
34
- datasourcelib-0.1.12.dist-info/METADATA,sha256=Rvu5r33TNr6s-ph4bH6MCcwOx_jELup4C3KNnmTZA8Y,1200
35
- datasourcelib-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
36
- datasourcelib-0.1.12.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
37
- datasourcelib-0.1.12.dist-info/RECORD,,
33
+ datasourcelib-0.1.14.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
34
+ datasourcelib-0.1.14.dist-info/METADATA,sha256=EyIP4yk74vb4-yhOWh--E9aXo8E1gt8RDVkYVoexfNI,1200
35
+ datasourcelib-0.1.14.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
36
+ datasourcelib-0.1.14.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
37
+ datasourcelib-0.1.14.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5