langchain-wtmapi 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Filippo Tedeschi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: langchain-wtmapi
3
+ Version: 0.1.0
4
+ Summary: LangChain Document Loader for WTM API — convert any webpage to Markdown
5
+ Author-email: Filippo Tedeschi <filippotedeschi98@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://wtmapi.com
8
+ Project-URL: Documentation, https://wtmapi.com/docs
9
+ Project-URL: Repository, https://github.com/filtede98/langchain-wtmapi
10
+ Keywords: langchain,document-loader,markdown,web-scraping,rag,llm,wtm-api
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Text Processing :: Markup
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: langchain-core>=0.2.0
25
+ Dynamic: license-file
26
+
27
+ # langchain-wtmapi
28
+
29
+ LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install langchain-wtmapi
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```python
40
+ from langchain_wtmapi import WTMApiLoader
41
+
42
+ # Get your free API key at https://wtmapi.com
43
+ loader = WTMApiLoader(
44
+ urls=[
45
+ "https://en.wikipedia.org/wiki/Artificial_intelligence",
46
+ "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
47
+ ],
48
+ api_key="wtm_your_api_key",
49
+ )
50
+
51
+ # Load all documents
52
+ docs = loader.load()
53
+
54
+ for doc in docs:
55
+ print(f"Source: {doc.metadata['source']}")
56
+ print(f"Length: {doc.metadata['length']} chars")
57
+ print(f"Response time: {doc.metadata['response_time_ms']}ms")
58
+ print(doc.page_content[:500])
59
+ print("---")
60
+ ```
61
+
62
+ ## Lazy Loading
63
+
64
+ For large batches, use lazy loading to process documents one at a time:
65
+
66
+ ```python
67
+ for doc in loader.lazy_load():
68
+ # Process each document as it's loaded
69
+ process_document(doc)
70
+ ```
71
+
72
+ ## Use with RAG Pipeline
73
+
74
+ ```python
75
+ from langchain_wtmapi import WTMApiLoader
76
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
77
+ from langchain_openai import OpenAIEmbeddings
78
+ from langchain_community.vectorstores import FAISS
79
+
80
+ # Load web pages as Markdown
81
+ loader = WTMApiLoader(
82
+ urls=["https://docs.python.org/3/tutorial/index.html"],
83
+ api_key="wtm_your_api_key",
84
+ )
85
+ docs = loader.load()
86
+
87
+ # Split into chunks
88
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
89
+ chunks = splitter.split_documents(docs)
90
+
91
+ # Create vector store
92
+ embeddings = OpenAIEmbeddings()
93
+ vectorstore = FAISS.from_documents(chunks, embeddings)
94
+
95
+ # Query
96
+ results = vectorstore.similarity_search("How do I use lists in Python?")
97
+ ```
98
+
99
+ ## Options
100
+
101
+ | Parameter | Type | Default | Description |
102
+ |-----------|------|---------|-------------|
103
+ | `urls` | `list[str]` | required | List of URLs to convert |
104
+ | `api_key` | `str` | required | Your WTM API key |
105
+ | `include_links` | `bool` | `True` | Include links in output |
106
+ | `include_images` | `bool` | `True` | Include images in output |
107
+ | `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
108
+
109
+ ## Document Metadata
110
+
111
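+ Each successfully converted document carries the metadata below; a URL that fails to convert still yields a document, with empty `page_content` and only `source` and `error` in its metadata: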
112
+
113
+ ```python
114
+ {
115
+ "source": "https://example.com", # Original URL
116
+ "length": 15234, # Markdown length in chars
117
+ "response_time_ms": 523, # API response time
118
+ "usage_used": 5, # Monthly calls used
119
+ "usage_limit": 50, # Monthly call limit
120
+ "plan": "free", # Current plan
121
+ }
122
+ ```
123
+
124
+ ## Get Your API Key
125
+
126
+ Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
127
+
128
+ ## License
129
+
130
+ MIT
@@ -0,0 +1,104 @@
1
+ # langchain-wtmapi
2
+
3
+ LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install langchain-wtmapi
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from langchain_wtmapi import WTMApiLoader
15
+
16
+ # Get your free API key at https://wtmapi.com
17
+ loader = WTMApiLoader(
18
+ urls=[
19
+ "https://en.wikipedia.org/wiki/Artificial_intelligence",
20
+ "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
21
+ ],
22
+ api_key="wtm_your_api_key",
23
+ )
24
+
25
+ # Load all documents
26
+ docs = loader.load()
27
+
28
+ for doc in docs:
29
+ print(f"Source: {doc.metadata['source']}")
30
+ print(f"Length: {doc.metadata['length']} chars")
31
+ print(f"Response time: {doc.metadata['response_time_ms']}ms")
32
+ print(doc.page_content[:500])
33
+ print("---")
34
+ ```
35
+
36
+ ## Lazy Loading
37
+
38
+ For large batches, use lazy loading to process documents one at a time:
39
+
40
+ ```python
41
+ for doc in loader.lazy_load():
42
+ # Process each document as it's loaded
43
+ process_document(doc)
44
+ ```
45
+
46
+ ## Use with RAG Pipeline
47
+
48
+ ```python
49
+ from langchain_wtmapi import WTMApiLoader
50
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
51
+ from langchain_openai import OpenAIEmbeddings
52
+ from langchain_community.vectorstores import FAISS
53
+
54
+ # Load web pages as Markdown
55
+ loader = WTMApiLoader(
56
+ urls=["https://docs.python.org/3/tutorial/index.html"],
57
+ api_key="wtm_your_api_key",
58
+ )
59
+ docs = loader.load()
60
+
61
+ # Split into chunks
62
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
63
+ chunks = splitter.split_documents(docs)
64
+
65
+ # Create vector store
66
+ embeddings = OpenAIEmbeddings()
67
+ vectorstore = FAISS.from_documents(chunks, embeddings)
68
+
69
+ # Query
70
+ results = vectorstore.similarity_search("How do I use lists in Python?")
71
+ ```
72
+
73
+ ## Options
74
+
75
+ | Parameter | Type | Default | Description |
76
+ |-----------|------|---------|-------------|
77
+ | `urls` | `list[str]` | required | List of URLs to convert |
78
+ | `api_key` | `str` | required | Your WTM API key |
79
+ | `include_links` | `bool` | `True` | Include links in output |
80
+ | `include_images` | `bool` | `True` | Include images in output |
81
+ | `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
82
+
83
+ ## Document Metadata
84
+
85
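+ Each successfully converted document carries the metadata below; a URL that fails to convert still yields a document, with empty `page_content` and only `source` and `error` in its metadata: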
86
+
87
+ ```python
88
+ {
89
+ "source": "https://example.com", # Original URL
90
+ "length": 15234, # Markdown length in chars
91
+ "response_time_ms": 523, # API response time
92
+ "usage_used": 5, # Monthly calls used
93
+ "usage_limit": 50, # Monthly call limit
94
+ "plan": "free", # Current plan
95
+ }
96
+ ```
97
+
98
+ ## Get Your API Key
99
+
100
+ Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
101
+
102
+ ## License
103
+
104
+ MIT
@@ -0,0 +1,3 @@
1
+ from langchain_wtmapi.document_loader import WTMApiLoader
2
+
3
+ __all__ = ["WTMApiLoader"]
@@ -0,0 +1,138 @@
1
+ """WTM API Document Loader for LangChain.
2
+
3
+ Convert any webpage to clean Markdown using the WTM API (https://wtmapi.com).
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import urllib.request
9
+ import json
10
+ from typing import Iterator, Optional
11
+
12
+ from langchain_core.document_loaders import BaseLoader
13
+ from langchain_core.documents import Document
14
+
15
+
16
class WTMApiLoader(BaseLoader):
    """Load web pages as Markdown documents using the WTM API.

    WTM API converts any webpage to structured Markdown, preserving
    headings, tables, code blocks, links, and images.

    Each URL is converted with one blocking HTTP POST to ``api_url``. A URL
    that fails to convert does not abort the batch: it yields a placeholder
    document with empty ``page_content`` and ``source``/``error`` metadata.

    Setup:
        Sign up at https://wtmapi.com to get a free API key (50 calls/month).

        .. code-block:: bash

            pip install langchain-wtmapi

    Instantiate:
        .. code-block:: python

            from langchain_wtmapi import WTMApiLoader

            loader = WTMApiLoader(
                urls=["https://en.wikipedia.org/wiki/Mars"],
                api_key="wtm_your_api_key",
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:200])
            print(docs[0].metadata)

    Lazy load:
        .. code-block:: python

            for doc in loader.lazy_load():
                print(doc.metadata["source"])
    """

    urls: list[str]
    """List of URLs to convert."""

    api_key: str
    """WTM API key. Get one free at https://wtmapi.com"""

    api_url: str = "https://wtmapi.com/api/v1/convert"
    """WTM API endpoint URL."""

    include_links: bool = True
    """Whether to include links in the Markdown output."""

    include_images: bool = True
    """Whether to include images in the Markdown output."""

    timeout: float = 30.0
    """Seconds to wait for each API response before giving up."""

    def __init__(
        self,
        urls: list[str],
        api_key: str,
        api_url: str = "https://wtmapi.com/api/v1/convert",
        include_links: bool = True,
        include_images: bool = True,
        timeout: float = 30.0,
    ) -> None:
        """Configure the loader; no network request is made here.

        Args:
            urls: URLs to convert. A single URL given as a plain string is
                also accepted and treated as a one-element list.
            api_key: WTM API key, sent in the ``x-api-key`` request header.
            api_url: Endpoint that receives the conversion requests.
            include_links: Forwarded to the API as ``include_links``.
            include_images: Forwarded to the API as ``include_images``.
            timeout: Per-request timeout in seconds.
        """
        # Iterating a bare string would issue one (metered) API call per
        # character, so wrap it instead of silently burning quota.
        self.urls = [urls] if isinstance(urls, str) else urls
        self.api_key = api_key
        self.api_url = api_url
        self.include_links = include_links
        self.include_images = include_images
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load documents from URLs via WTM API.

        URLs are converted one at a time, in order, as the iterator is
        consumed.

        Yields:
            Document: A document with Markdown content and metadata. When a
            conversion raises, the yielded document instead has empty
            content and metadata containing only ``source`` and ``error``.
        """
        for url in self.urls:
            try:
                doc = self._convert_url(url)
            except Exception as e:
                # Deliberate best-effort: one bad URL must not abort the
                # batch, so the failure is reported in-band as a document.
                yield Document(
                    page_content="",
                    metadata={
                        "source": url,
                        "error": str(e),
                    },
                )
            else:
                if doc is not None:
                    yield doc

    def _convert_url(self, url: str) -> Optional[Document]:
        """Convert a single URL to a Document via the WTM API.

        Args:
            url: The page to convert.

        Returns:
            A document whose content is the returned Markdown and whose
            metadata holds ``source``, ``length``, ``response_time_ms``,
            ``usage_used``, ``usage_limit`` and ``plan``.

        Raises:
            ValueError: If the response reports ``success`` as falsy or
                contains no Markdown.
            urllib.error.URLError: If the request fails at the HTTP or
                network level (raised by ``urlopen``).
        """
        payload = json.dumps({
            "url": url,
            "include_links": self.include_links,
            "include_images": self.include_images,
        }).encode("utf-8")

        req = urllib.request.Request(
            self.api_url,
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": self.api_key,
            },
            method="POST",
        )

        with urllib.request.urlopen(req, timeout=self.timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        if not data.get("success"):
            error = data.get("error", "Unknown error")
            raise ValueError(f"WTM API error: {error}")

        # ``or {}`` also covers keys that are present but JSON ``null``,
        # which ``dict.get(key, {})`` would pass through as ``None``.
        result = data.get("data") or {}
        markdown = result.get("markdown")
        if markdown is None:
            raise ValueError("WTM API error: response contained no markdown")

        usage = result.get("usage") or {}
        meta = data.get("meta") or {}
        return Document(
            page_content=markdown,
            metadata={
                "source": url,
                "length": result.get("length", 0),
                "response_time_ms": meta.get("response_time_ms"),
                "usage_used": usage.get("used"),
                "usage_limit": usage.get("limit"),
                "plan": usage.get("plan"),
            },
        )
@@ -0,0 +1,130 @@
1
+ Metadata-Version: 2.4
2
+ Name: langchain-wtmapi
3
+ Version: 0.1.0
4
+ Summary: LangChain Document Loader for WTM API — convert any webpage to Markdown
5
+ Author-email: Filippo Tedeschi <filippotedeschi98@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://wtmapi.com
8
+ Project-URL: Documentation, https://wtmapi.com/docs
9
+ Project-URL: Repository, https://github.com/filtede98/langchain-wtmapi
10
+ Keywords: langchain,document-loader,markdown,web-scraping,rag,llm,wtm-api
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.9
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Classifier: Topic :: Text Processing :: Markup
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: langchain-core>=0.2.0
25
+ Dynamic: license-file
26
+
27
+ # langchain-wtmapi
28
+
29
+ LangChain Document Loader for [WTM API](https://wtmapi.com) — convert any webpage to clean, structured Markdown.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install langchain-wtmapi
35
+ ```
36
+
37
+ ## Quick Start
38
+
39
+ ```python
40
+ from langchain_wtmapi import WTMApiLoader
41
+
42
+ # Get your free API key at https://wtmapi.com
43
+ loader = WTMApiLoader(
44
+ urls=[
45
+ "https://en.wikipedia.org/wiki/Artificial_intelligence",
46
+ "https://developer.mozilla.org/en-US/docs/Web/JavaScript",
47
+ ],
48
+ api_key="wtm_your_api_key",
49
+ )
50
+
51
+ # Load all documents
52
+ docs = loader.load()
53
+
54
+ for doc in docs:
55
+ print(f"Source: {doc.metadata['source']}")
56
+ print(f"Length: {doc.metadata['length']} chars")
57
+ print(f"Response time: {doc.metadata['response_time_ms']}ms")
58
+ print(doc.page_content[:500])
59
+ print("---")
60
+ ```
61
+
62
+ ## Lazy Loading
63
+
64
+ For large batches, use lazy loading to process documents one at a time:
65
+
66
+ ```python
67
+ for doc in loader.lazy_load():
68
+ # Process each document as it's loaded
69
+ process_document(doc)
70
+ ```
71
+
72
+ ## Use with RAG Pipeline
73
+
74
+ ```python
75
+ from langchain_wtmapi import WTMApiLoader
76
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
77
+ from langchain_openai import OpenAIEmbeddings
78
+ from langchain_community.vectorstores import FAISS
79
+
80
+ # Load web pages as Markdown
81
+ loader = WTMApiLoader(
82
+ urls=["https://docs.python.org/3/tutorial/index.html"],
83
+ api_key="wtm_your_api_key",
84
+ )
85
+ docs = loader.load()
86
+
87
+ # Split into chunks
88
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
89
+ chunks = splitter.split_documents(docs)
90
+
91
+ # Create vector store
92
+ embeddings = OpenAIEmbeddings()
93
+ vectorstore = FAISS.from_documents(chunks, embeddings)
94
+
95
+ # Query
96
+ results = vectorstore.similarity_search("How do I use lists in Python?")
97
+ ```
98
+
99
+ ## Options
100
+
101
+ | Parameter | Type | Default | Description |
102
+ |-----------|------|---------|-------------|
103
+ | `urls` | `list[str]` | required | List of URLs to convert |
104
+ | `api_key` | `str` | required | Your WTM API key |
105
+ | `include_links` | `bool` | `True` | Include links in output |
106
+ | `include_images` | `bool` | `True` | Include images in output |
107
+ | `api_url` | `str` | `https://wtmapi.com/api/v1/convert` | API endpoint |
108
+
109
+ ## Document Metadata
110
+
111
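+ Each successfully converted document carries the metadata below; a URL that fails to convert still yields a document, with empty `page_content` and only `source` and `error` in its metadata: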
112
+
113
+ ```python
114
+ {
115
+ "source": "https://example.com", # Original URL
116
+ "length": 15234, # Markdown length in chars
117
+ "response_time_ms": 523, # API response time
118
+ "usage_used": 5, # Monthly calls used
119
+ "usage_limit": 50, # Monthly call limit
120
+ "plan": "free", # Current plan
121
+ }
122
+ ```
123
+
124
+ ## Get Your API Key
125
+
126
+ Sign up at [wtmapi.com](https://wtmapi.com) — free tier includes 50 calls/month, no credit card required.
127
+
128
+ ## License
129
+
130
+ MIT
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ langchain_wtmapi/__init__.py
5
+ langchain_wtmapi/document_loader.py
6
+ langchain_wtmapi.egg-info/PKG-INFO
7
+ langchain_wtmapi.egg-info/SOURCES.txt
8
+ langchain_wtmapi.egg-info/dependency_links.txt
9
+ langchain_wtmapi.egg-info/requires.txt
10
+ langchain_wtmapi.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ langchain-core>=0.2.0
@@ -0,0 +1 @@
1
+ langchain_wtmapi
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "langchain-wtmapi"
7
+ version = "0.1.0"
8
+ description = "LangChain Document Loader for WTM API — convert any webpage to Markdown"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ {name = "Filippo Tedeschi", email = "filippotedeschi98@gmail.com"},
14
+ ]
15
+ keywords = ["langchain", "document-loader", "markdown", "web-scraping", "rag", "llm", "wtm-api"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Software Development :: Libraries",
26
+ "Topic :: Text Processing :: Markup",
27
+ ]
28
+ dependencies = [
29
+ "langchain-core>=0.2.0",
30
+ ]
31
+
32
+ [project.urls]
33
+ Homepage = "https://wtmapi.com"
34
+ Documentation = "https://wtmapi.com/docs"
35
+ Repository = "https://github.com/filtede98/langchain-wtmapi"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+