datasourcelib 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datasourcelib/datasources/azure_devops_source.py

@@ -5,6 +5,7 @@ from datasourcelib.utils.validators import require_keys
  import base64
  import json
  from bs4 import BeautifulSoup
+ import regex as re

  logger = get_logger(__name__)

@@ -17,7 +18,7 @@ class AzureDevOpsSource(DataSourceBase):

      def validate_config(self) -> bool:
          try:
-             require_keys(self.config, ["ado_organization", "ado_personal_access_token", "ado_project", "ado_query_id"])
+             require_keys(self.config, ["ado_organization", "ado_personal_access_token"])
              return True
          except Exception as ex:
              logger.error("AzureDevOpsSource.validate_config: %s", ex)
@@ -35,12 +36,18 @@ class AzureDevOpsSource(DataSourceBase):
          logger.info("AzureDevOpsSource ready (no persistent connection required)")
          return True

+     @staticmethod
+     def sanitize(s: str) -> str:
+         """Keep only A-Z a-z 0-9 underscore/dash/equals in a safe way."""
+         # using the `regex` import present as `re`
+         return re.sub(r'[^A-Za-z0-9_\-=]', '', s)
+
      def disconnect(self) -> None:
          self._headers = {}
          self._connected = False
          logger.info("AzureDevOpsSource cleared")

-     def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+     def fetch_query_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
          if requests is None:
              raise RuntimeError("requests package is required for AzureDevOpsSource")
          if not getattr(self, "_connected", False):
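The new sanitize helper reduces a string to the character class [A-Za-z0-9_\-=]. A standalone sketch of its behavior (using stdlib re, which handles this pattern identically to the regex package the module imports):

    import re

    def sanitize(s: str) -> str:
        # strip every character outside A-Z, a-z, 0-9, underscore, dash, equals
        return re.sub(r'[^A-Za-z0-9_\-=]', '', s)

    print(sanitize("Release Notes (v2)!"))  # -> ReleaseNotesv2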
@@ -181,3 +188,215 @@ class AzureDevOpsSource(DataSourceBase):
              work_item_details.append(entry)

          return work_item_details
+
+     def fetch_wiki_data(self, wiki_name: Optional[str] = None, max_depth: int = 3, **kwargs) -> List[Dict[str, Any]]:
+         """
+         Crawl wiki pages in the configured Azure DevOps organization/project and return a list of
+         dicts: {"display_name": str, "url": str, "content": str, "project": str}.
+         - wiki_name: optional filter to select a single wiki by name
+         - max_depth: how many child levels to traverse (>=1)
+         - If ado_project is configured, only fetch wikis from that project.
+         - Otherwise, fetch wikis from all projects in the organization.
+         """
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         if not getattr(self, "_connected", False):
+             self.connect()
+
+         org = self.config.get("ado_organization")
+         configured_project = self.config.get("ado_project")  # renamed so the loop below doesn't overwrite it
+         api_version = self.config.get("api_version", "7.1")
+         headers = getattr(self, "_headers", {})
+
+         results: List[Dict[str, Any]] = []
+         seen_paths = set()
+
+         # Determine which projects to process
+         projects_to_process = []
+         if configured_project:
+             # Use only the configured project
+             projects_to_process = [configured_project]
+             logger.info("fetch_wiki_data: Using configured project: %s", configured_project)
+         else:
+             # Fetch all projects in the organization
+             try:
+                 projects_url = f"https://dev.azure.com/{org}/_apis/projects?api-version={api_version}"
+                 proj_resp = requests.get(projects_url, headers=headers, timeout=30)
+                 proj_resp.raise_for_status()
+                 proj_json = proj_resp.json()
+                 projects_list = proj_json.get("value", [])
+                 projects_to_process = [p.get("name") or p.get("id") for p in projects_list if p.get("name") or p.get("id")]
+                 logger.info("fetch_wiki_data: Found %d projects in organization", len(projects_to_process))
+             except Exception as ex:
+                 logger.exception("Failed to list projects in organization: %s", ex)
+                 return []
+
+         # Process each project
+         for project_name in projects_to_process:
+             logger.info("fetch_wiki_data: Processing project: %s", project_name)
+
+             # 1) List wikis in this project
+             wikis_url = f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis?api-version={api_version}"
+             try:
+                 resp = requests.get(wikis_url, headers=headers, timeout=30)
+                 resp.raise_for_status()
+                 wikis_json = resp.json()
+                 wikis = wikis_json.get("value", []) if isinstance(wikis_json, dict) else []
+             except Exception as ex:
+                 logger.warning("Failed to list wikis for project %s: %s", project_name, ex)
+                 continue
+
+             # Filter wikis by name if a filter was specified
+             selected_wikis = []
+             for w in wikis:
+                 name = w.get("name") or w.get("wikiName") or ""
+                 if wiki_name:
+                     if name.lower() == wiki_name.lower():
+                         selected_wikis.append(w)
+                 else:
+                     # Include all wikis for this project
+                     selected_wikis.append(w)
+
+             if not selected_wikis:
+                 logger.debug("No wikis found in project %s matching filter (wiki_name=%s)", project_name, wiki_name)
+                 continue
+
+             # 2) Crawl pages in each wiki
+             for wiki in selected_wikis:
+                 wiki_id = wiki.get("id") or wiki.get("name")
+                 wiki_display = wiki.get("name") or wiki.get("wikiName") or str(wiki_id)
+                 logger.info("fetch_wiki_data: Crawling wiki '%s' in project '%s'", wiki_display, project_name)
+
+                 # BFS queue of (path, depth). Start at root path "/"
+                 queue = [("/", 1)]
+
+                 while queue:
+                     path, depth = queue.pop(0)
+                     if depth > max_depth:
+                         continue
+
+                     # Pages listing for this path with recursionLevel=1 to get direct children
+                     pages_url = (
+                         f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis/{wiki_id}/pages"
+                         f"?path={path}&recursionLevel=1&api-version={api_version}"
+                     )
+                     try:
+                         p_resp = requests.get(pages_url, headers=headers, timeout=30)
+                         p_resp.raise_for_status()
+                         p_json = p_resp.json()
+                         pages = p_json.get("value") or p_json.get("subPages") or []
+                     except Exception as ex:
+                         logger.warning("Failed to list pages for wiki %s path %s in project %s: %s",
+                                        wiki_display, path, project_name, ex)
+                         pages = []
+
+                     for page in pages:
+                         page_path = page.get("path") or "/"
+                         # Dedupe by project + wiki id + path
+                         key = f"{project_name}:{wiki_id}:{page_path}"
+                         if key in seen_paths:
+                             continue
+                         seen_paths.add(key)
+
+                         # Display name and url
+                         display_name = page.get("name") or page.get("pageName") or page_path.strip("/") or "/"
+                         new_display_name = self.sanitize(display_name.replace(" ", "_").strip())
+                         url = (
+                             page.get("remoteUrl")
+                             or page.get("url")
+                             or (page.get("_links") or {}).get("web", {}).get("href")
+                             or ""
+                         )
+
+                         # Fetch page content (includeContent)
+                         content_text = ""
+                         try:
+                             content_url = (
+                                 f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis/{wiki_id}/pages"
+                                 f"?path={page_path}&includeContent=true&api-version={api_version}"
+                             )
+                             c_resp = requests.get(content_url, headers=headers, timeout=30)
+                             c_resp.raise_for_status()
+                             c_json = c_resp.json()
+
+                             # Page content may be in several places depending on API version
+                             if isinstance(c_json, dict):
+                                 # If the API returns a page object
+                                 content_text = (
+                                     c_json.get("content")
+                                     or (c_json.get("value", [{}])[0].get("content", "") if c_json.get("value") else "")
+                                     or c_json.get("text", "")
+                                 )
+                             else:
+                                 # Fallback to raw bytes
+                                 content_text = c_resp.content.decode("utf-8", errors="ignore")
+                         except Exception as fetch_ex:
+                             logger.debug("Failed to fetch content for page %s: %s", display_name, fetch_ex)
+                             # Best-effort fallback: try to GET the web url (may return HTML)
+                             if url:
+                                 try:
+                                     w_resp = requests.get(url, headers=headers, timeout=30)
+                                     w_resp.raise_for_status()
+                                     content_text = w_resp.content.decode("utf-8", errors="ignore")
+                                 except Exception:
+                                     content_text = ""
+                         # Construct a 'full' description string using available pieces
+                         content_text = BeautifulSoup(content_text or "", "html.parser").get_text()
+                         parts = []
+                         if new_display_name:
+                             parts.append(f"Wiki Page Name is {display_name}. Page has information about {display_name}")
+                         if project_name:
+                             parts.append(f"This page is documented for Project '{project_name}' and by the team '{project_name}'")
+                         if url:
+                             parts.append(f"The devops wiki page link (url) to access this page is {url}")
+                         if project_name:
+                             parts.append(f"This wiki page content refers to SharePoint site links and other documents from SharePoint, so to get full detailed steps or contents you need to follow those links with appropriate permissions. The page contents available on the wiki are [{content_text}].")
+
+                         index_content = ". ".join(parts)
+                         results.append({
+                             "display_name": new_display_name,
+                             "url": url,
+                             "content": index_content,
+                             "project": project_name
+                         })
+
+                         # Enqueue child pages
+                         if depth < max_depth:
+                             # If the page has a children field, use it
+                             children = page.get("children") or []
+                             if children:
+                                 for ch in children:
+                                     ch_path = ch.get("path") or ch
+                                     queue.append((ch_path, depth + 1))
+                             else:
+                                 # Fallback: attempt to list the sub-path under the current page path
+                                 sub_path = page_path.rstrip("/") + "/"
+                                 queue.append((sub_path, depth + 1))
+
+         logger.info("fetch_wiki_data completed: Retrieved %d wiki pages", len(results))
+         return results
+
+     def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+         """
+         Dispatch the fetch call to either the wiki downloader or the WIQL/query fetcher.
+
+         Priority:
+         1. kwargs['ado_download_wiki'] if provided
+         2. self.config['ado_download_wiki'] otherwise
+
+         Accepts the same params as fetch_query_data / fetch_wiki_data and returns their output.
+         """
+         # Determine flag from kwargs first, then config
+         download_flag = kwargs.pop("ado_download_wiki", None)
+         if download_flag is None:
+             download_flag = self.config.get("ado_download_wiki", False)
+
+         # Normalize boolean-like strings
+         if isinstance(download_flag, str):
+             download_flag = download_flag.strip().lower() in ("1", "true", "yes", "y", "on")
+
+         if download_flag:
+             # Pass query as wiki_name if the caller intended that; otherwise kwargs are forwarded
+             return self.fetch_wiki_data(wiki_name=query, **kwargs)
+         else:
+             return self.fetch_query_data(query=query, **kwargs)
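fetch_data is now a thin dispatcher: a truthy ado_download_wiki (taken from kwargs first, then config, with strings like "true"/"yes" normalized) routes to fetch_wiki_data, and everything else falls through to the original WIQL path, renamed fetch_query_data. A hedged usage sketch (the constructor signature is assumed, since DataSourceBase is not part of this diff):

    src = AzureDevOpsSource(config)  # assumes the base class stores `config` on the instance

    # Wiki crawl: the `query` argument doubles as the wiki_name filter
    pages = src.fetch_data(query="TeamWiki", ado_download_wiki="yes")

    # Default path: WIQL query via fetch_query_data, using config["ado_query_id"]
    items = src.fetch_data()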
datasourcelib/datasources/azure_devops_source10dec.py (new file, a dated snapshot of the 0.1.10 source)

@@ -0,0 +1,183 @@
+ from typing import Any, Dict, List, Optional
+ from datasourcelib.datasources.datasource_base import DataSourceBase
+ from datasourcelib.utils.logger import get_logger
+ from datasourcelib.utils.validators import require_keys
+ import base64
+ import json
+ from bs4 import BeautifulSoup
+
+ logger = get_logger(__name__)
+
+ try:
+     import requests  # type: ignore
+ except Exception:
+     requests = None  # lazy import handled at runtime
+
+ class AzureDevOpsSource(DataSourceBase):
+
+     def validate_config(self) -> bool:
+         try:
+             require_keys(self.config, ["ado_organization", "ado_personal_access_token", "ado_project", "ado_query_id"])
+             return True
+         except Exception as ex:
+             logger.error("AzureDevOpsSource.validate_config: %s", ex)
+             return False
+
+     def connect(self) -> bool:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         # No persistent connection; store auth header
+         pat = self.config.get("ado_personal_access_token")
+         token = pat
+         token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+         self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+         self._connected = True
+         logger.info("AzureDevOpsSource ready (no persistent connection required)")
+         return True
+
+     def disconnect(self) -> None:
+         self._headers = {}
+         self._connected = False
+         logger.info("AzureDevOpsSource cleared")
+
+     def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+         if requests is None:
+             raise RuntimeError("requests package is required for AzureDevOpsSource")
+         if not getattr(self, "_connected", False):
+             self.connect()
+
+         org = self.config.get("ado_organization")
+         project = self.config.get("ado_project")
+         query_id = self.config.get("ado_query_id")
+         api_version = self.config.get("api_version", "7.1")
+         if not query_id:
+             raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+         base = f"https://dev.azure.com/{org}/"
+         if project:
+             base = f"{base}{project}/"
+         # WIQL query by id (returns list of work item refs)
+         wiql_url = f"{base}_apis/wit/wiql/{query_id}"
+         params = {"api-version": api_version}
+         method = self.config.get("method", "GET").upper()
+         query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
+         query_response.raise_for_status()
+
+         if query_response.status_code != 200:
+             raise RuntimeError(f"Error: {query_response.status_code}")
+
+         work_items_refs = query_response.json().get('workItems', []) or []
+         if not work_items_refs:
+             return []
+
+         # collect ids and fetch details in batch to get all fields for all work item types
+         ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+         if not ids:
+             return []
+
+         details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+         # expand=all to include fields, relations, and attachments
+         params = {
+             "ids": ",".join(ids),
+             "api-version": api_version,
+             "$expand": "all"
+         }
+         details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+         details_resp.raise_for_status()
+         items = details_resp.json().get("value", [])
+
+         work_item_details: List[Dict[str, Any]] = []
+         for item in items:
+             item_id = item.get("id")
+             fields = item.get("fields", {}) or {}
+
+             # Normalize field keys to safe snake_case-like keys
+             norm_fields: Dict[str, Any] = {}
+             for k, v in fields.items():
+                 nk = k.replace(".", "_")
+                 nk = nk.lower()
+                 norm_fields[nk] = v
+
+             # Helper to safely extract nested displayName for assigned to
+             assigned = norm_fields.get("system_assignedto")
+             if isinstance(assigned, dict):
+                 assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+             else:
+                 assigned_to = assigned
+
+             # find a description-like field (some types use different field names)
+             desc = ""
+             for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                 if fk in norm_fields:
+                     desc = norm_fields.get(fk) or ""
+                     break
+             if not desc:
+                 # fallback: first field key that contains 'description'
+                 for kf, vf in norm_fields.items():
+                     if "description" in kf and vf:
+                         desc = vf
+                         break
+
+             # clean HTML description to text
+             try:
+                 c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+             except Exception:
+                 c_desc = desc or ""
+
+             # Build common convenience values (use available fields)
+             wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+             title = norm_fields.get("system_title") or ""
+             status = norm_fields.get("system_state") or ""
+             created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+             changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+             tags = norm_fields.get("system_tags", "")
+             project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""
+
+             rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+             target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+             # Construct a 'full' description string using available pieces
+             parts = []
+             if wi_type:
+                 parts.append(f"{wi_type} ID {item_id}")
+             else:
+                 parts.append(f"WorkItem {item_id}")
+             if created:
+                 parts.append(f"was created on {created}")
+             if title:
+                 parts.append(f"and has Title '{title}'")
+             if status:
+                 parts.append(f"is currently in {status} state")
+             if assigned_to:
+                 parts.append(f"is assigned to {assigned_to}")
+             if project_name:
+                 parts.append(f"for Project '{project_name}'")
+             if rtype:
+                 parts.append(f"release type '{rtype}'")
+             if target_date:
+                 parts.append(f"with target date '{target_date}'")
+             if tags:
+                 parts.append(f"Tags: {tags}")
+             if c_desc:
+                 parts.append(f"Description: [{c_desc}]")
+             fullfeature = ". ".join(parts)
+
+             # include all normalized fields in the returned object for completeness
+             entry = {
+                 "id": item_id,
+                 "type": wi_type,
+                 "title": title,
+                 "status": status,
+                 "assigned_to": assigned_to,
+                 "created": created,
+                 "changed_date": changed,
+                 "tags": tags,
+                 "project": project_name,
+                 "release_type": rtype,
+                 "target_date": target_date,
+                 "description": c_desc,
+                 "full": fullfeature
+             }
+             work_item_details.append(entry)
+
+         return work_item_details
datasourcelib-0.1.12.dist-info/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datasourcelib
- Version: 0.1.10
+ Version: 0.1.12
  Summary: Data source sync strategies for vector DBs
  Home-page: https://github.com/akashmaurya0217/datasourcelib
  Author: Akash Kumar Maurya
datasourcelib-0.1.12.dist-info/RECORD

@@ -5,7 +5,8 @@ datasourcelib/core/sync_manager.py,sha256=pfnvWv4AwmlJJUIsfxNNxYDBOsa7juTIxgFJIE
  datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
  datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
  datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
- datasourcelib/datasources/azure_devops_source.py,sha256=J48E78AEfqkS-eBq7sesA48zmSiZ9oSfJkQjL7RAbyA,7928
+ datasourcelib/datasources/azure_devops_source.py,sha256=o-rl090HxbBA_Sl6WHazIDoA1NhjybIrmyQCU0SwzqA,19649
+ datasourcelib/datasources/azure_devops_source10dec.py,sha256=J48E78AEfqkS-eBq7sesA48zmSiZ9oSfJkQjL7RAbyA,7928
  datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
  datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
  datasourcelib/datasources/datasource_types.py,sha256=jpm4f9n1l7X9aBD58Pbr9evXiCHHEhRCLojGwchUD7A,205
@@ -29,8 +30,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
  datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
  datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
  datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
- datasourcelib-0.1.10.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
- datasourcelib-0.1.10.dist-info/METADATA,sha256=d3oHE59WsVQ52-wWztDEKwcwg51lUIVHTIGrss9HP7E,1200
- datasourcelib-0.1.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- datasourcelib-0.1.10.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
- datasourcelib-0.1.10.dist-info/RECORD,,
+ datasourcelib-0.1.12.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+ datasourcelib-0.1.12.dist-info/METADATA,sha256=Rvu5r33TNr6s-ph4bH6MCcwOx_jELup4C3KNnmTZA8Y,1200
+ datasourcelib-0.1.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ datasourcelib-0.1.12.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+ datasourcelib-0.1.12.dist-info/RECORD,,