langchain-wtmapi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_wtmapi-0.1.0/LICENSE +21 -0
- langchain_wtmapi-0.1.0/PKG-INFO +130 -0
- langchain_wtmapi-0.1.0/README.md +104 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi/__init__.py +3 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi/document_loader.py +138 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi.egg-info/PKG-INFO +130 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi.egg-info/SOURCES.txt +10 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi.egg-info/dependency_links.txt +1 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi.egg-info/requires.txt +1 -0
- langchain_wtmapi-0.1.0/langchain_wtmapi.egg-info/top_level.txt +1 -0
- langchain_wtmapi-0.1.0/pyproject.toml +35 -0
- langchain_wtmapi-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Filippo Tedeschi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-wtmapi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LangChain Document Loader for WTM API — convert any webpage to Markdown
|
|
5
|
+
Author-email: Filippo Tedeschi <filippotedeschi98@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://wtmapi.com
|
|
8
|
+
Project-URL: Documentation, https://wtmapi.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/filtede98/langchain-wtmapi
|
|
10
|
+
Keywords: langchain,document-loader,markdown,web-scraping,rag,llm,wtm-api
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: langchain-core>=0.2.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# langchain-wtmapi
|
|
28
|
+
|
|
29
|
+
LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install langchain-wtmapi
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from langchain_wtmapi import WTMApiLoader
|
|
41
|
+
|
|
42
|
+
# Get your free API key at https://wtmapi.com
|
|
43
|
+
loader = WTMApiLoader(
|
|
44
|
+
urls=[
|
|
45
|
+
"https://en.wikipedia.org/wiki/Artificial_intelligence",
|
|
46
|
+
"https://developer.mozilla.org/en-US/docs/Web/JavaScript",
|
|
47
|
+
],
|
|
48
|
+
api_key="wtm_your_api_key",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Load all documents
|
|
52
|
+
docs = loader.load()
|
|
53
|
+
|
|
54
|
+
for doc in docs:
|
|
55
|
+
print(f"Source: {doc.metadata['source']}")
|
|
56
|
+
print(f"Length: {doc.metadata['length']} chars")
|
|
57
|
+
print(f"Response time: {doc.metadata['response_time_ms']}ms")
|
|
58
|
+
print(doc.page_content[:500])
|
|
59
|
+
print("---")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Lazy Loading
|
|
63
|
+
|
|
64
|
+
For large batches, use lazy loading to process documents one at a time:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
for doc in loader.lazy_load():
|
|
68
|
+
# Process each document as it's loaded
|
|
69
|
+
process_document(doc)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Use with RAG Pipeline
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from langchain_wtmapi import WTMApiLoader
|
|
76
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
77
|
+
from langchain_openai import OpenAIEmbeddings
|
|
78
|
+
from langchain_community.vectorstores import FAISS
|
|
79
|
+
|
|
80
|
+
# Load web pages as Markdown
|
|
81
|
+
loader = WTMApiLoader(
|
|
82
|
+
urls=["https://docs.python.org/3/tutorial/index.html"],
|
|
83
|
+
api_key="wtm_your_api_key",
|
|
84
|
+
)
|
|
85
|
+
docs = loader.load()
|
|
86
|
+
|
|
87
|
+
# Split into chunks
|
|
88
|
+
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
|
89
|
+
chunks = splitter.split_documents(docs)
|
|
90
|
+
|
|
91
|
+
# Create vector store
|
|
92
|
+
embeddings = OpenAIEmbeddings()
|
|
93
|
+
vectorstore = FAISS.from_documents(chunks, embeddings)
|
|
94
|
+
|
|
95
|
+
# Query
|
|
96
|
+
results = vectorstore.similarity_search("How do I use lists in Python?")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Options
|
|
100
|
+
|
|
101
|
+
| Parameter | Type | Default | Description |
|
|
102
|
+
|-----------|------|---------|-------------|
|
|
103
|
+
| `urls` | `list[str]` | required | List of URLs to convert |
|
|
104
|
+
| `api_key` | `str` | required | Your WTM API key |
|
|
105
|
+
| `include_links` | `bool` | `True` | Include links in output |
|
|
106
|
+
| `include_images` | `bool` | `True` | Include images in output |
|
|
107
|
+
| `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
|
|
108
|
+
|
|
109
|
+
## Document Metadata
|
|
110
|
+
|
|
111
|
+
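Each successfully converted document includes the metadata below. If a URL fails, the loader instead yields a document with empty `page_content` whose metadata contains only `source` and `error`: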
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
{
|
|
115
|
+
"source": "https://example.com", # Original URL
|
|
116
|
+
"length": 15234, # Markdown length in chars
|
|
117
|
+
"response_time_ms": 523, # API response time
|
|
118
|
+
"usage_used": 5, # Monthly calls used
|
|
119
|
+
"usage_limit": 50, # Monthly call limit
|
|
120
|
+
"plan": "free", # Current plan
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Get Your API Key
|
|
125
|
+
|
|
126
|
+
Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
|
|
127
|
+
|
|
128
|
+
## License
|
|
129
|
+
|
|
130
|
+
MIT
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# langchain-wtmapi
|
|
2
|
+
|
|
3
|
+
LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install langchain-wtmapi
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from langchain_wtmapi import WTMApiLoader
|
|
15
|
+
|
|
16
|
+
# Get your free API key at https://wtmapi.com
|
|
17
|
+
loader = WTMApiLoader(
|
|
18
|
+
urls=[
|
|
19
|
+
"https://en.wikipedia.org/wiki/Artificial_intelligence",
|
|
20
|
+
"https://developer.mozilla.org/en-US/docs/Web/JavaScript",
|
|
21
|
+
],
|
|
22
|
+
api_key="wtm_your_api_key",
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
# Load all documents
|
|
26
|
+
docs = loader.load()
|
|
27
|
+
|
|
28
|
+
for doc in docs:
|
|
29
|
+
print(f"Source: {doc.metadata['source']}")
|
|
30
|
+
print(f"Length: {doc.metadata['length']} chars")
|
|
31
|
+
print(f"Response time: {doc.metadata['response_time_ms']}ms")
|
|
32
|
+
print(doc.page_content[:500])
|
|
33
|
+
print("---")
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Lazy Loading
|
|
37
|
+
|
|
38
|
+
For large batches, use lazy loading to process documents one at a time:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
for doc in loader.lazy_load():
|
|
42
|
+
# Process each document as it's loaded
|
|
43
|
+
process_document(doc)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Use with RAG Pipeline
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from langchain_wtmapi import WTMApiLoader
|
|
50
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
51
|
+
from langchain_openai import OpenAIEmbeddings
|
|
52
|
+
from langchain_community.vectorstores import FAISS
|
|
53
|
+
|
|
54
|
+
# Load web pages as Markdown
|
|
55
|
+
loader = WTMApiLoader(
|
|
56
|
+
urls=["https://docs.python.org/3/tutorial/index.html"],
|
|
57
|
+
api_key="wtm_your_api_key",
|
|
58
|
+
)
|
|
59
|
+
docs = loader.load()
|
|
60
|
+
|
|
61
|
+
# Split into chunks
|
|
62
|
+
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
|
63
|
+
chunks = splitter.split_documents(docs)
|
|
64
|
+
|
|
65
|
+
# Create vector store
|
|
66
|
+
embeddings = OpenAIEmbeddings()
|
|
67
|
+
vectorstore = FAISS.from_documents(chunks, embeddings)
|
|
68
|
+
|
|
69
|
+
# Query
|
|
70
|
+
results = vectorstore.similarity_search("How do I use lists in Python?")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Options
|
|
74
|
+
|
|
75
|
+
| Parameter | Type | Default | Description |
|
|
76
|
+
|-----------|------|---------|-------------|
|
|
77
|
+
| `urls` | `list[str]` | required | List of URLs to convert |
|
|
78
|
+
| `api_key` | `str` | required | Your WTM API key |
|
|
79
|
+
| `include_links` | `bool` | `True` | Include links in output |
|
|
80
|
+
| `include_images` | `bool` | `True` | Include images in output |
|
|
81
|
+
| `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
|
|
82
|
+
|
|
83
|
+
## Document Metadata
|
|
84
|
+
|
|
85
|
+
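Each successfully converted document includes the metadata below. If a URL fails, the loader instead yields a document with empty `page_content` whose metadata contains only `source` and `error`: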
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
{
|
|
89
|
+
"source": "https://example.com", # Original URL
|
|
90
|
+
"length": 15234, # Markdown length in chars
|
|
91
|
+
"response_time_ms": 523, # API response time
|
|
92
|
+
"usage_used": 5, # Monthly calls used
|
|
93
|
+
"usage_limit": 50, # Monthly call limit
|
|
94
|
+
"plan": "free", # Current plan
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Get Your API Key
|
|
99
|
+
|
|
100
|
+
Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
MIT
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""WTM API Document Loader for LangChain.
|
|
2
|
+
|
|
3
|
+
Convert any webpage to clean Markdown using the WTM API (https://wtmapi.com).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import urllib.request
|
|
9
|
+
import json
|
|
10
|
+
from typing import Iterator, Optional
|
|
11
|
+
|
|
12
|
+
from langchain_core.document_loaders import BaseLoader
|
|
13
|
+
from langchain_core.documents import Document
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class WTMApiLoader(BaseLoader):
    """Load web pages as Markdown documents using the WTM API.

    WTM API converts any webpage to structured Markdown, preserving
    headings, tables, code blocks, links, and images.

    Each URL is converted with one blocking HTTP POST to ``api_url``.
    A URL that fails to convert does not abort the run: the loader yields
    a placeholder ``Document`` with empty ``page_content`` whose metadata
    holds only ``source`` and ``error``.

    Setup:
        Sign up at https://wtmapi.com to get a free API key (50 calls/month).

        .. code-block:: bash

            pip install langchain-wtmapi

    Instantiate:
        .. code-block:: python

            from langchain_wtmapi import WTMApiLoader

            loader = WTMApiLoader(
                urls=["https://en.wikipedia.org/wiki/Mars"],
                api_key="wtm_your_api_key",
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:200])
            print(docs[0].metadata)

    Lazy load:
        .. code-block:: python

            for doc in loader.lazy_load():
                print(doc.metadata["source"])
    """

    urls: list[str]
    """List of URLs to convert."""

    api_key: str
    """WTM API key. Get one free at https://wtmapi.com"""

    api_url: str = "https://wtmapi.com/api/v1/convert"
    """WTM API endpoint URL."""

    include_links: bool = True
    """Whether to include links in the Markdown output."""

    include_images: bool = True
    """Whether to include images in the Markdown output."""

    def __init__(
        self,
        urls: list[str],
        api_key: str,
        api_url: str = "https://wtmapi.com/api/v1/convert",
        include_links: bool = True,
        include_images: bool = True,
    ) -> None:
        """Store the loader configuration; no network I/O happens here.

        Args:
            urls: URLs to convert. A single bare string is accepted and
                treated as a one-element list.
            api_key: WTM API key, sent in the ``x-api-key`` request header.
            api_url: Endpoint that receives the conversion requests.
            include_links: Forwarded to the API as ``include_links``.
            include_images: Forwarded to the API as ``include_images``.
        """
        # A bare string is itself iterable: without this guard lazy_load()
        # would issue one API call per *character* of the URL. Copying into
        # a list also lets one-shot iterables be loaded more than once.
        self.urls = [urls] if isinstance(urls, str) else list(urls)
        self.api_key = api_key
        self.api_url = api_url
        self.include_links = include_links
        self.include_images = include_images

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load documents from URLs via WTM API.

        URLs are converted one at a time, in the order given. Any exception
        raised while converting a URL is caught and reported as a placeholder
        document so the remaining URLs are still processed.

        Yields:
            Document: A document with Markdown content and metadata, or, on
            failure, an empty document whose metadata is
            ``{"source": url, "error": message}``.
        """
        for url in self.urls:
            # Keep only the conversion inside the try body so an exception
            # thrown into the generator at a yield is not misreported as a
            # conversion failure for this URL.
            try:
                doc = self._convert_url(url)
            except Exception as e:  # deliberate best-effort: record and go on
                yield Document(
                    page_content="",
                    metadata={
                        "source": url,
                        "error": str(e),
                    },
                )
            else:
                if doc is not None:
                    yield doc

    @staticmethod
    def _read_api_error(exc: Exception) -> Optional[str]:
        """Return the ``error`` field of a JSON error body, if there is one.

        Returns ``None`` when the body cannot be read, is not JSON, is not a
        JSON object, or carries no non-empty ``error`` entry.
        """
        try:
            body = json.loads(exc.read().decode("utf-8"))
        except (AttributeError, OSError, ValueError):
            # AttributeError: the exception exposes no readable body.
            # ValueError covers both invalid JSON and invalid UTF-8.
            return None
        if isinstance(body, dict) and body.get("error"):
            return str(body["error"])
        return None

    def _convert_url(self, url: str) -> Optional[Document]:
        """Convert a single URL to a Document via the WTM API.

        Args:
            url: The web page to convert.

        Returns:
            A Document whose ``page_content`` is the returned Markdown and
            whose metadata holds ``source``, ``length``, ``response_time_ms``,
            ``usage_used``, ``usage_limit`` and ``plan``.

        Raises:
            ValueError: The API reported a failure (``success`` is falsy, or
                an HTTP error response carried a JSON ``error`` message).
            urllib.error.URLError: The request failed at the transport level,
                or returned an HTTP error without a usable JSON error body.
            KeyError: The response lacks the ``data`` or ``markdown`` field.
        """
        from urllib.error import HTTPError

        payload = json.dumps({
            "url": url,
            "include_links": self.include_links,
            "include_images": self.include_images,
        }).encode("utf-8")

        req = urllib.request.Request(
            self.api_url,
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": self.api_key,
            },
            method="POST",
        )

        try:
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = json.loads(resp.read().decode("utf-8"))
        except HTTPError as exc:
            # urlopen raises on non-2xx statuses, so without this branch the
            # API's own message would be lost behind a generic "HTTP Error".
            # NOTE(review): assumes error responses use the same JSON envelope
            # as 2xx failures -- confirm against the WTM API documentation.
            api_error = self._read_api_error(exc)
            if api_error is None:
                raise
            raise ValueError(f"WTM API error: {api_error}") from exc

        if not data.get("success"):
            error = data.get("error", "Unknown error")
            raise ValueError(f"WTM API error: {error}")

        result = data["data"]
        # `or {}` also covers keys that are present but null, which
        # `.get(key, {})` alone would pass through as None.
        usage = result.get("usage") or {}
        meta = data.get("meta") or {}
        return Document(
            page_content=result["markdown"],
            metadata={
                "source": url,
                "length": result.get("length", 0),
                "response_time_ms": meta.get("response_time_ms"),
                "usage_used": usage.get("used"),
                "usage_limit": usage.get("limit"),
                "plan": usage.get("plan"),
            },
        )
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-wtmapi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LangChain Document Loader for WTM API — convert any webpage to Markdown
|
|
5
|
+
Author-email: Filippo Tedeschi <filippotedeschi98@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://wtmapi.com
|
|
8
|
+
Project-URL: Documentation, https://wtmapi.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/filtede98/langchain-wtmapi
|
|
10
|
+
Keywords: langchain,document-loader,markdown,web-scraping,rag,llm,wtm-api
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: langchain-core>=0.2.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# langchain-wtmapi
|
|
28
|
+
|
|
29
|
+
LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install langchain-wtmapi
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from langchain_wtmapi import WTMApiLoader
|
|
41
|
+
|
|
42
|
+
# Get your free API key at https://wtmapi.com
|
|
43
|
+
loader = WTMApiLoader(
|
|
44
|
+
urls=[
|
|
45
|
+
"https://en.wikipedia.org/wiki/Artificial_intelligence",
|
|
46
|
+
"https://developer.mozilla.org/en-US/docs/Web/JavaScript",
|
|
47
|
+
],
|
|
48
|
+
api_key="wtm_your_api_key",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Load all documents
|
|
52
|
+
docs = loader.load()
|
|
53
|
+
|
|
54
|
+
for doc in docs:
|
|
55
|
+
print(f"Source: {doc.metadata['source']}")
|
|
56
|
+
print(f"Length: {doc.metadata['length']} chars")
|
|
57
|
+
print(f"Response time: {doc.metadata['response_time_ms']}ms")
|
|
58
|
+
print(doc.page_content[:500])
|
|
59
|
+
print("---")
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Lazy Loading
|
|
63
|
+
|
|
64
|
+
For large batches, use lazy loading to process documents one at a time:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
for doc in loader.lazy_load():
|
|
68
|
+
# Process each document as it's loaded
|
|
69
|
+
process_document(doc)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Use with RAG Pipeline
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from langchain_wtmapi import WTMApiLoader
|
|
76
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
77
|
+
from langchain_openai import OpenAIEmbeddings
|
|
78
|
+
from langchain_community.vectorstores import FAISS
|
|
79
|
+
|
|
80
|
+
# Load web pages as Markdown
|
|
81
|
+
loader = WTMApiLoader(
|
|
82
|
+
urls=["https://docs.python.org/3/tutorial/index.html"],
|
|
83
|
+
api_key="wtm_your_api_key",
|
|
84
|
+
)
|
|
85
|
+
docs = loader.load()
|
|
86
|
+
|
|
87
|
+
# Split into chunks
|
|
88
|
+
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
|
89
|
+
chunks = splitter.split_documents(docs)
|
|
90
|
+
|
|
91
|
+
# Create vector store
|
|
92
|
+
embeddings = OpenAIEmbeddings()
|
|
93
|
+
vectorstore = FAISS.from_documents(chunks, embeddings)
|
|
94
|
+
|
|
95
|
+
# Query
|
|
96
|
+
results = vectorstore.similarity_search("How do I use lists in Python?")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Options
|
|
100
|
+
|
|
101
|
+
| Parameter | Type | Default | Description |
|
|
102
|
+
|-----------|------|---------|-------------|
|
|
103
|
+
| `urls` | `list[str]` | required | List of URLs to convert |
|
|
104
|
+
| `api_key` | `str` | required | Your WTM API key |
|
|
105
|
+
| `include_links` | `bool` | `True` | Include links in output |
|
|
106
|
+
| `include_images` | `bool` | `True` | Include images in output |
|
|
107
|
+
| `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
|
|
108
|
+
|
|
109
|
+
## Document Metadata
|
|
110
|
+
|
|
111
|
+
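Each successfully converted document includes the metadata below. If a URL fails, the loader instead yields a document with empty `page_content` whose metadata contains only `source` and `error`: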
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
{
|
|
115
|
+
"source": "https://example.com", # Original URL
|
|
116
|
+
"length": 15234, # Markdown length in chars
|
|
117
|
+
"response_time_ms": 523, # API response time
|
|
118
|
+
"usage_used": 5, # Monthly calls used
|
|
119
|
+
"usage_limit": 50, # Monthly call limit
|
|
120
|
+
"plan": "free", # Current plan
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Get Your API Key
|
|
125
|
+
|
|
126
|
+
Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
|
|
127
|
+
|
|
128
|
+
## License
|
|
129
|
+
|
|
130
|
+
MIT
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
langchain_wtmapi/__init__.py
|
|
5
|
+
langchain_wtmapi/document_loader.py
|
|
6
|
+
langchain_wtmapi.egg-info/PKG-INFO
|
|
7
|
+
langchain_wtmapi.egg-info/SOURCES.txt
|
|
8
|
+
langchain_wtmapi.egg-info/dependency_links.txt
|
|
9
|
+
langchain_wtmapi.egg-info/requires.txt
|
|
10
|
+
langchain_wtmapi.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
langchain-core>=0.2.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
langchain_wtmapi
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "langchain-wtmapi"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "LangChain Document Loader for WTM API — convert any webpage to Markdown"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Filippo Tedeschi", email = "filippotedeschi98@gmail.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["langchain", "document-loader", "markdown", "web-scraping", "rag", "llm", "wtm-api"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Software Development :: Libraries",
|
|
26
|
+
"Topic :: Text Processing :: Markup",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"langchain-core>=0.2.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://wtmapi.com"
|
|
34
|
+
Documentation = "https://wtmapi.com/docs"
|
|
35
|
+
Repository = "https://github.com/filtede98/langchain-wtmapi"
|