datasourcelib 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,7 @@ from datasourcelib.utils.validators import require_keys
 import base64
 import json
 from bs4 import BeautifulSoup
+import regex as re

 logger = get_logger(__name__)

@@ -17,7 +18,7 @@ class AzureDevOpsSource(DataSourceBase):

     def validate_config(self) -> bool:
         try:
-            require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+            require_keys(self.config, ["ado_organization", "ado_personal_access_token"])
             return True
         except Exception as ex:
             logger.error("AzureDevOpsSource.validate_config: %s", ex)
@@ -35,12 +36,18 @@ class AzureDevOpsSource(DataSourceBase):
         logger.info("AzureDevOpsSource ready (no persistent connection required)")
         return True

+    @staticmethod
+    def sanitize(s: str) -> str:
+        """Keep only A-Z, a-z, 0-9, underscore, dash and equals characters."""
+        # uses the third-party `regex` package imported above as `re`
+        return re.sub(r'[^A-Za-z0-9_\-=]', '', s)
+
     def disconnect(self) -> None:
         self._headers = {}
         self._connected = False
         logger.info("AzureDevOpsSource cleared")

-    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+    def fetch_query_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
         if requests is None:
             raise RuntimeError("requests package is required for AzureDevOpsSource")
         if not getattr(self, "_connected", False):
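The new sanitize helper is later applied to wiki page display names (fetch_wiki_data first replaces spaces with underscores, then strips everything outside A-Z, a-z, 0-9, underscore, dash and equals). The pattern uses no feature specific to the third-party regex package, so stdlib re behaves identically here; a quick illustration:

    import re  # stdlib re is sufficient for this pattern; the package imports `regex as re`

    def sanitize(s: str) -> str:
        return re.sub(r'[^A-Za-z0-9_\-=]', '', s)

    print(sanitize("Release Notes (Q3/2024)".replace(" ", "_")))  # -> Release_Notes_Q32024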
@@ -176,9 +183,208 @@ class AzureDevOpsSource(DataSourceBase):
                 "release_type": rtype,
                 "target_date": target_date,
                 "description": c_desc,
-                "full": fullfeature,
-                "fields": norm_fields # full field set for this work item
+                "full": fullfeature
             }
             work_item_details.append(entry)

         return work_item_details
+
+    def fetch_wiki_data(self, wiki_name: Optional[str] = None, max_depth: int = 3, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Crawl wiki pages in the configured Azure DevOps organization/project and return a list of
+        dicts: {"display_name": str, "url": str, "content": str, "wiki": str, "project": str}.
+        - wiki_name: optional filter to select a single wiki by name
+        - max_depth: how many child levels to traverse (>=1)
+        - If ado_project is configured, only fetch wikis from that project.
+        - Otherwise, fetch wikis from all projects in the organization.
+        """
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        if not getattr(self, "_connected", False):
+            self.connect()
+
+        org = self.config.get("ado_organization")
+        configured_project = self.config.get("ado_project")  # Rename to avoid overwriting in loop
+        api_version = self.config.get("api_version", "7.1")
+        headers = getattr(self, "_headers", {})
+
+        results: List[Dict[str, Any]] = []
+        seen_paths = set()
+
+        # Determine which projects to process
+        projects_to_process = []
+        if configured_project:
+            # Use only the configured project
+            projects_to_process = [configured_project]
+            logger.info("fetch_wiki_data: Using configured project: %s", configured_project)
+        else:
+            # Fetch all projects in the organization
+            try:
+                projects_url = f"https://dev.azure.com/{org}/_apis/projects?api-version={api_version}"
+                proj_resp = requests.get(projects_url, headers=headers, timeout=30)
+                proj_resp.raise_for_status()
+                proj_json = proj_resp.json()
+                projects_list = proj_json.get("value", [])
+                projects_to_process = [p.get("name") or p.get("id") for p in projects_list if p.get("name") or p.get("id")]
+                logger.info("fetch_wiki_data: Found %d projects in organization", len(projects_to_process))
+            except Exception as ex:
+                logger.exception("Failed to list projects in organization: %s", ex)
+                return []
+
+        # Process each project
+        for project_name in projects_to_process:
+            logger.info("fetch_wiki_data: Processing project: %s", project_name)
+
+            # 1) List wikis in this project
+            wikis_url = f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis?api-version={api_version}"
+            try:
+                resp = requests.get(wikis_url, headers=headers, timeout=30)
+                resp.raise_for_status()
+                wikis_json = resp.json()
+                wikis = wikis_json.get("value", []) if isinstance(wikis_json, dict) else []
+            except Exception as ex:
+                logger.warning("Failed to list wikis for project %s: %s", project_name, ex)
+                continue
+
+            # Filter selected wikis by name if specified
+            selected_wikis = []
+            for w in wikis:
+                name = w.get("name") or w.get("wikiName") or ""
+                if wiki_name:
+                    if name.lower() == wiki_name.lower():
+                        selected_wikis.append(w)
+                else:
+                    # Include all wikis for this project
+                    selected_wikis.append(w)
+
+            if not selected_wikis:
+                logger.debug("No wikis found in project %s matching filter (wiki_name=%s)", project_name, wiki_name)
+                continue
+
+            # 2) Crawl pages in each wiki
+            for wiki in selected_wikis:
+                wiki_id = wiki.get("id") or wiki.get("name")
+                wiki_display = wiki.get("name") or wiki.get("wikiName") or str(wiki_id)
+                logger.info("fetch_wiki_data: Crawling wiki '%s' in project '%s'", wiki_display, project_name)
+
+                # BFS queue of (path, depth). Start at root path "/"
+                queue = [("/", 1)]
+
+                while queue:
+                    path, depth = queue.pop(0)
+                    if depth > max_depth:
+                        continue
+
+                    # Pages listing for this path with recursionLevel=1 to get direct children
+                    pages_url = (
+                        f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis/{wiki_id}/pages"
+                        f"?path={path}&recursionLevel=1&api-version={api_version}"
+                    )
+                    try:
+                        p_resp = requests.get(pages_url, headers=headers, timeout=30)
+                        p_resp.raise_for_status()
+                        p_json = p_resp.json()
+                        pages = p_json.get("value") or p_json.get("subPages") or []
+                    except Exception as ex:
+                        logger.warning("Failed to list pages for wiki %s path %s in project %s: %s",
+                                       wiki_display, path, project_name, ex)
+                        pages = []
+
+                    for page in pages:
+                        page_path = page.get("path") or "/"
+                        # Dedupe by wiki id + project + path
+                        key = f"{project_name}:{wiki_id}:{page_path}"
+                        if key in seen_paths:
+                            continue
+                        seen_paths.add(key)
+
+                        # Display name and url
+                        display_name = page.get("name") or page.get("pageName") or page_path.strip("/") or "/"
+                        url = (
+                            page.get("remoteUrl")
+                            or page.get("url")
+                            or (page.get("_links") or {}).get("web", {}).get("href")
+                            or ""
+                        )
+
+                        # Fetch page content (includeContent)
+                        content_text = ""
+                        try:
+                            content_url = (
+                                f"https://dev.azure.com/{org}/{project_name}/_apis/wiki/wikis/{wiki_id}/pages"
+                                f"?path={page_path}&includeContent=true&api-version={api_version}"
+                            )
+                            c_resp = requests.get(content_url, headers=headers, timeout=30)
+                            c_resp.raise_for_status()
+                            c_json = c_resp.json()
+
+                            # Page content may be in several places depending on API version
+                            if isinstance(c_json, dict):
+                                # If API returns page object
+                                content_text = (
+                                    c_json.get("content")
+                                    or (c_json.get("value", [{}])[0].get("content", "") if c_json.get("value") else "")
+                                    or c_json.get("text", "")
+                                )
+                            else:
+                                # Fallback to raw bytes
+                                content_text = c_resp.content.decode("utf-8", errors="ignore")
+                        except Exception as fetch_ex:
+                            logger.debug("Failed to fetch content for page %s: %s", display_name, fetch_ex)
+                            # Best-effort fallback: try to GET the web url (may return HTML)
+                            if url:
+                                try:
+                                    w_resp = requests.get(url, headers=headers, timeout=30)
+                                    w_resp.raise_for_status()
+                                    content_text = w_resp.content.decode("utf-8", errors="ignore")
+                                except Exception:
+                                    content_text = ""
+
+                        results.append({
+                            "display_name": self.sanitize(display_name.replace(" ", "_").strip()),
+                            "url": url,
+                            "content": BeautifulSoup(content_text or "", "html.parser").get_text(),
+                            "wiki": wiki_display,
+                            "project": project_name
+                        })
+
+                        # Enqueue child pages
+                        if depth < max_depth:
+                            # If page has children field, use it
+                            children = page.get("children") or []
+                            if children:
+                                for ch in children:
+                                    ch_path = ch.get("path") or ch
+                                    queue.append((ch_path, depth + 1))
+                            else:
+                                # Fallback: attempt to list sub-path under current page path
+                                sub_path = page_path.rstrip("/") + "/"
+                                queue.append((sub_path, depth + 1))
+
+        logger.info("fetch_wiki_data completed: Retrieved %d wiki pages", len(results))
+        return results
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Dispatch fetch call to either wiki downloader or WIQL/query fetcher.
+
+        Priority:
+        1. kwargs['ado_download_wiki'] if provided
+        2. self.config['ado_download_wiki'] otherwise
+
+        Accepts same params as fetch_query_data / fetch_wiki_data and returns their output.
+        """
+        # Determine flag from kwargs first, then config
+        download_flag = kwargs.pop("ado_download_wiki", None)
+        if download_flag is None:
+            download_flag = self.config.get("ado_download_wiki", False)
+
+        # normalize boolean-like strings
+        if isinstance(download_flag, str):
+            download_flag = download_flag.strip().lower() in ("1", "true", "yes", "y", "on")
+
+        if download_flag:
+            # pass query as wiki_name if caller intended, otherwise kwargs forwarded
+            return self.fetch_wiki_data(wiki_name=query, **kwargs)
+        else:
+            return self.fetch_query_data(query=query, **kwargs)
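fetch_data is now a thin dispatcher: existing callers keep getting WIQL query results, while setting ado_download_wiki (as a keyword argument or in the config, with "1"/"true"/"yes"/"y"/"on" strings accepted) routes to the wiki crawler and reinterprets query as the wiki-name filter. A usage sketch under the same assumed constructor as above, with illustrative values:

    source = AzureDevOpsSource({
        "ado_organization": "my-org",
        "ado_personal_access_token": "<PAT>",
        "ado_project": "MyProject",
        "ado_query_id": "<saved-query-guid>",
    })
    source.connect()

    # Default path: saved WIQL query -> flattened work-item dicts (same as the old fetch_data)
    work_items = source.fetch_data()

    # Wiki path: the kwargs flag wins over config; `query` becomes the wiki-name filter
    pages = source.fetch_data(query="MyProject.wiki", ado_download_wiki=True, max_depth=2)
    for page in pages:
        print(page["display_name"], page["url"], len(page["content"]))

The final code hunk below adds a brand-new module whose body closely mirrors the previous WIQL-only implementation; judging by the RECORD listing at the end of the diff, it appears to ship as datasourcelib/datasources/azure_devops_source10dec.py.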
@@ -0,0 +1,183 @@
+from typing import Any, Dict, List, Optional
+from datasourcelib.datasources.datasource_base import DataSourceBase
+from datasourcelib.utils.logger import get_logger
+from datasourcelib.utils.validators import require_keys
+import base64
+import json
+from bs4 import BeautifulSoup
+
+logger = get_logger(__name__)
+
+try:
+    import requests  # type: ignore
+except Exception:
+    requests = None  # lazy import handled at runtime
+
+class AzureDevOpsSource(DataSourceBase):
+
+    def validate_config(self) -> bool:
+        try:
+            require_keys(self.config, ["ado_organization", "ado_personal_access_token","ado_project","ado_query_id"])
+            return True
+        except Exception as ex:
+            logger.error("AzureDevOpsSource.validate_config: %s", ex)
+            return False
+
+    def connect(self) -> bool:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        # No persistent connection; store auth header
+        pat = self.config.get("ado_personal_access_token")
+        token = pat
+        token_b64 = base64.b64encode(token.encode("utf-8")).decode("utf-8")
+        self._headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
+        self._connected = True
+        logger.info("AzureDevOpsSource ready (no persistent connection required)")
+        return True
+
+    def disconnect(self) -> None:
+        self._headers = {}
+        self._connected = False
+        logger.info("AzureDevOpsSource cleared")
+
+    def fetch_data(self, query: Optional[str] = None, **kwargs) -> List[Dict[str, Any]]:
+        if requests is None:
+            raise RuntimeError("requests package is required for AzureDevOpsSource")
+        if not getattr(self, "_connected", False):
+            self.connect()
+
+        org = self.config.get("ado_organization")
+        project = self.config.get("ado_project")
+        query_id = self.config.get("ado_query_id")
+        api_version = self.config.get("api_version", "7.1")
+        if not query_id:
+            raise ValueError("AzureDevOpsSource.fetch_data requires 'query_id' or query argument")
+
+        base = f"https://dev.azure.com/{org}/"
+        if project:
+            base = f"{base}{project}/"
+        # WIQL query by id (returns list of work item refs)
+        wiql_url = f"{base}_apis/wit/wiql/{query_id}"
+        params = {"api-version": api_version}
+        method = self.config.get("method", "GET").upper()
+        query_response = requests.request(method, wiql_url, headers=getattr(self, "_headers", {}), params=params)
+        query_response.raise_for_status()
+
+        if query_response.status_code != 200:
+            raise RuntimeError(f"Error: {query_response.status_code}")
+
+        work_items_refs = query_response.json().get('workItems', []) or []
+        if not work_items_refs:
+            return []
+
+        # collect ids and fetch details in batch to get all fields for all work item types
+        ids = [str(item.get('id')) for item in work_items_refs if item.get('id')]
+        if not ids:
+            return []
+
+        details_url = f"https://dev.azure.com/{org}/{project}/_apis/wit/workitems"
+        # expand=all to include fields, relations, and attachments
+        params = {
+            "ids": ",".join(ids),
+            "api-version": api_version,
+            "$expand": "all"
+        }
+        details_resp = requests.get(details_url, headers=getattr(self, "_headers", {}), params=params)
+        details_resp.raise_for_status()
+        items = details_resp.json().get("value", [])
+
+        work_item_details: List[Dict[str, Any]] = []
+        for item in items:
+            item_id = item.get("id")
+            fields = item.get("fields", {}) or {}
+
+            # Normalize field keys to safe snake_case-like keys
+            norm_fields: Dict[str, Any] = {}
+            for k, v in fields.items():
+                nk = k.replace(".", "_")
+                nk = nk.lower()
+                norm_fields[nk] = v
+
+            # Helper to safely extract nested displayName for assigned to
+            assigned = norm_fields.get("system_assignedto")
+            if isinstance(assigned, dict):
+                assigned_to = assigned.get("displayName") or assigned.get("uniqueName") or str(assigned)
+            else:
+                assigned_to = assigned
+
+            # find a description-like field (some types use different field names)
+            desc = ""
+            for fk in ["system_description", "microsoft_vsts_createdby", "html_description"]:
+                if fk in norm_fields:
+                    desc = norm_fields.get(fk) or ""
+                    break
+            if not desc:
+                # fallback: first field key that contains 'description'
+                for kf, vf in norm_fields.items():
+                    if "description" in kf and vf:
+                        desc = vf
+                        break
+
+            # clean HTML description to text
+            try:
+                c_desc = BeautifulSoup(desc or "", "html.parser").get_text()
+            except Exception:
+                c_desc = desc or ""
+
+            # Build common convenience values (use available fields)
+            wi_type = norm_fields.get("system_workitemtype") or norm_fields.get("system_witype") or ""
+            title = norm_fields.get("system_title") or ""
+            status = norm_fields.get("system_state") or ""
+            created = norm_fields.get("system_createddate") or norm_fields.get("system_created") or ""
+            changed = norm_fields.get("system_changeddate") or norm_fields.get("system_changed") or ""
+            tags = norm_fields.get("system_tags", "")
+            project_name = norm_fields.get("custom.projectname") or norm_fields.get("system_teamproject") or ""
+
+            rtype = norm_fields.get("custom.releasetype") or norm_fields.get("custom_releasetype") or ""
+            target_date = norm_fields.get("microsoft_vsts_scheduling_targetdate") or norm_fields.get("microsoft.vsts.scheduling.targetdate") or ""
+
+            # Construct a 'full' description string using available pieces
+            parts = []
+            if wi_type:
+                parts.append(f"{wi_type} ID {item_id}")
+            else:
+                parts.append(f"WorkItem {item_id}")
+            if created:
+                parts.append(f"was created on {created}")
+            if title:
+                parts.append(f"and has Title '{title}'")
+            if status:
+                parts.append(f"is currently in {status} state")
+            if assigned_to:
+                parts.append(f"is assigned to {assigned_to}")
+            if project_name:
+                parts.append(f"for Project '{project_name}'")
+            if rtype:
+                parts.append(f"release type '{rtype}'")
+            if target_date:
+                parts.append(f"with target date '{target_date}'")
+            if tags:
+                parts.append(f"Tags: {tags}")
+            if c_desc:
+                parts.append(f"Description: [{c_desc}]")
+            fullfeature = ". ".join(parts)
+
+            # include all normalized fields in the returned object for completeness
+            entry = {
+                "id": item_id,
+                "type": wi_type,
+                "title": title,
+                "status": status,
+                "assigned_to": assigned_to,
+                "created": created,
+                "changed_date": changed,
+                "tags": tags,
+                "project": project_name,
+                "release_type": rtype,
+                "target_date": target_date,
+                "description": c_desc,
+                "full": fullfeature
+            }
+            work_item_details.append(entry)
+
+        return work_item_details
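One detail worth noting: connect() (unchanged in 0.1.11 and shown in full in this snapshot module) base64-encodes the PAT into token_b64 but never uses it, sending the raw PAT as a Bearer token instead. The documented way to authenticate to the Azure DevOps REST API with a PAT is HTTP Basic auth over base64(":" + PAT); a self-contained sketch, not part of the package, with an illustrative organization name:

    import base64
    import requests

    def pat_headers(pat: str) -> dict:
        # Empty username, PAT as password, base64-encoded for the Basic scheme
        token_b64 = base64.b64encode(f":{pat}".encode("utf-8")).decode("utf-8")
        return {"Authorization": f"Basic {token_b64}", "Content-Type": "application/json"}

    resp = requests.get(
        "https://dev.azure.com/my-org/_apis/projects?api-version=7.1",
        headers=pat_headers("<PAT>"),
        timeout=30,
    )
    resp.raise_for_status()
    print([p["name"] for p in resp.json().get("value", [])])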
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datasourcelib
-Version: 0.1.9
+Version: 0.1.11
 Summary: Data source sync strategies for vector DBs
 Home-page: https://github.com/akashmaurya0217/datasourcelib
 Author: Akash Kumar Maurya
@@ -5,7 +5,8 @@ datasourcelib/core/sync_manager.py,sha256=pfnvWv4AwmlJJUIsfxNNxYDBOsa7juTIxgFJIE
 datasourcelib/core/sync_types.py,sha256=KVZB7PkfkFTzghoe--U8jLeAU8XAfba9qMRIVcUjuMc,297
 datasourcelib/datasources/__init__.py,sha256=lZtgs0vT-2gub5UZo8BUnREZl3K_-_xYqUP8mjf8vhM,436
 datasourcelib/datasources/azure_devops_source copy.py,sha256=g-IOCq5vGwwteU21jZPWW_GggMu1_myVJkP0_BmSdGY,7282
-datasourcelib/datasources/azure_devops_source.py,sha256=3hyZIrUdgwZEQNjb2iZGDMJcAw3Z6r7oV0hWAq_zMsg,8005
+datasourcelib/datasources/azure_devops_source.py,sha256=A1RhV0uy-6wJ0_HGTf9LhIafkxoFou3HO-r9HDmEAnY,18571
+datasourcelib/datasources/azure_devops_source10dec.py,sha256=J48E78AEfqkS-eBq7sesA48zmSiZ9oSfJkQjL7RAbyA,7928
 datasourcelib/datasources/blob_source.py,sha256=Qk61_ulqUSPYDaiMzqgvJAu43c4AjTlDRdfFg4VwgDU,3574
 datasourcelib/datasources/datasource_base.py,sha256=N8fOGvTl8oWWAiydLI0Joz66luq73a5yovO0XA9Q3jk,1068
 datasourcelib/datasources/datasource_types.py,sha256=jpm4f9n1l7X9aBD58Pbr9evXiCHHEhRCLojGwchUD7A,205
@@ -29,8 +30,8 @@ datasourcelib/utils/exceptions.py,sha256=mgcDaW1k3VndgpMOwSm7NqgyRTvvE2a5ehn3x4f
 datasourcelib/utils/file_reader.py,sha256=Zr0rwNTRWE6KeVJEXgTOPS1_JI74LiUSiX5-6qojmN0,7301
 datasourcelib/utils/logger.py,sha256=Sl6lNlvubxtK9ztzyq7vjGVyA8_-pZ_ixpk5jfVsh6U,424
 datasourcelib/utils/validators.py,sha256=fLgmRAb5OZSdMVlHu_n0RKJUDl-G8dI8JsRSfxIquh8,205
-datasourcelib-0.1.9.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
-datasourcelib-0.1.9.dist-info/METADATA,sha256=e5FeHitCJ3JZaYoBST4sE4awseM3Sl7-kBVTAwVEXfk,1199
-datasourcelib-0.1.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-datasourcelib-0.1.9.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
-datasourcelib-0.1.9.dist-info/RECORD,,
+datasourcelib-0.1.11.dist-info/licenses/LICENSE,sha256=9S0AcKETmp9XOcC73jEjN7WSkuSWGFGreiBat6ONClo,1087
+datasourcelib-0.1.11.dist-info/METADATA,sha256=lfafhWbmV2lNtpSFntgM62Q7TJsyl_atJe4HhCvjdKo,1200
+datasourcelib-0.1.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datasourcelib-0.1.11.dist-info/top_level.txt,sha256=wIwiwdIj8T9pAvE2TkGLUvT2oIi43C2vkkTKibUlv3U,14
+datasourcelib-0.1.11.dist-info/RECORD,,