datasourcelib 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/PKG-INFO +1 -1
  2. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/setup.py +1 -1
  3. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/azure_devops_source.py +3 -3
  4. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/aggregation.py +8 -1
  5. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib.egg-info/PKG-INFO +1 -1
  6. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/LICENSE +0 -0
  7. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/MANIFEST.in +0 -0
  8. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/README.md +0 -0
  9. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/pyproject.toml +0 -0
  10. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/setup.cfg +0 -0
  11. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/__init__.py +0 -0
  12. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/core/__init__.py +0 -0
  13. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/core/sync_base.py +0 -0
  14. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/core/sync_manager.py +0 -0
  15. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/core/sync_types.py +0 -0
  16. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/__init__.py +0 -0
  17. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/azure_devops_source copy.py +0 -0
  18. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/azure_devops_source10dec.py +0 -0
  19. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/blob_source.py +0 -0
  20. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/datasource_base.py +0 -0
  21. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/datasource_types.py +0 -0
  22. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/dataverse_source.py +0 -0
  23. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/sharepoint_source - Copy.py +0 -0
  24. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/sharepoint_source.py +0 -0
  25. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/sql_source.py +0 -0
  26. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/datasources/sql_source_bkup.py +0 -0
  27. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/indexes/__init__.py +0 -0
  28. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/indexes/azure_search_index.py +0 -0
  29. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/__init__.py +0 -0
  30. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/daily_load.py +0 -0
  31. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/full_load.py +0 -0
  32. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/incremental_load.py +0 -0
  33. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/ondemand_load.py +0 -0
  34. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/strategies/timerange_load.py +0 -0
  35. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/__init__.py +0 -0
  36. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/byte_reader.py +0 -0
  37. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/exceptions.py +0 -0
  38. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/file_reader.py +0 -0
  39. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/logger.py +0 -0
  40. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib/utils/validators.py +0 -0
  41. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib.egg-info/SOURCES.txt +0 -0
  42. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib.egg-info/dependency_links.txt +0 -0
  43. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib.egg-info/requires.txt +0 -0
  44. {datasourcelib-0.1.12 → datasourcelib-0.1.14}/src/datasourcelib.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasourcelib
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Data source sync strategies for vector DBs
5
5
  Home-page: https://github.com/akashmaurya0217/datasourcelib
6
6
  Author: Akash Kumar Maurya
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="datasourcelib",
5
- version="0.1.12",
5
+ version="0.1.14",
6
6
  packages=find_packages(where="src", exclude=["tests.*", "tests", "examples.*", "examples"]),
7
7
  package_dir={"": "src"},
8
8
  install_requires=[
@@ -300,7 +300,6 @@ class AzureDevOpsSource(DataSourceBase):
300
300
 
301
301
  # Display name and url
302
302
  display_name = page.get("name") or page.get("pageName") or page_path.strip("/") or "/"
303
- new_display_name = self.sanitize(display_name.replace(" ", "_").strip()),
304
303
  url = (
305
304
  page.get("remoteUrl")
306
305
  or page.get("url")
@@ -343,7 +342,7 @@ class AzureDevOpsSource(DataSourceBase):
343
342
  # Construct a 'full' description string using available pieces
344
343
  content_text = BeautifulSoup(content_text or "", "html.parser").get_text(),
345
344
  parts = []
346
- if new_display_name:
345
+ if display_name:
347
346
  parts.append(f"Wiki Page Name is {display_name}. Page has information about {display_name}")
348
347
  if project_name:
349
348
  parts.append(f"This page is documented by for Project '{project_name}' and by the team '{project_name}'")
@@ -354,7 +353,8 @@ class AzureDevOpsSource(DataSourceBase):
354
353
 
355
354
  index_content = ". ".join(parts)
356
355
  results.append({
357
- "display_name": new_display_name,
356
+ "display_name": self.sanitize(display_name.replace(" ", "_").strip()),
357
+ "page_name": display_name,
358
358
  "url": url,
359
359
  "content": index_content,
360
360
  "project": project_name
@@ -2,6 +2,7 @@
2
2
  import pandas as pd
3
3
  from string import Formatter
4
4
  from typing import Iterable, Any, Dict, List, Optional, Union
5
+ import regex as re
5
6
 
6
7
  def _placeholders(fmt: str) -> List[str]:
7
8
  """
@@ -12,6 +13,12 @@ def _placeholders(fmt: str) -> List[str]:
12
13
  def _safe_str(x) -> str:
13
14
  return "" if pd.isna(x) else str(x).strip()
14
15
 
16
+ @staticmethod
17
+ def sanitize(s: str) -> str:
18
+ """Keep only A-Z a-z 0-9 underscore/dash/equals in a safe way."""
19
+ # using the `regex` import already present as `re`
20
+ return re.sub(r'[^A-Za-z0-9_\-=]', '', s)
21
+
15
22
  def generate_grouped_summaries(
16
23
  df: pd.DataFrame,
17
24
  aggregation_field: str,
@@ -146,7 +153,7 @@ def generate_grouped_summaries(
146
153
 
147
154
  content = header + " " + " ".join(lines)
148
155
  summaries.append(
149
- {"content" : content, "id": group_value}
156
+ {"content" : content, "id": sanitize(group_value.replace(" ", "_").strip())}
150
157
  )
151
158
 
152
159
  return summaries
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datasourcelib
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Data source sync strategies for vector DBs
5
5
  Home-page: https://github.com/akashmaurya0217/datasourcelib
6
6
  Author: Akash Kumar Maurya
File without changes
File without changes
File without changes