data-prep-connector 0.2.2__tar.gz → 0.2.2.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/Makefile +1 -3
  2. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/PKG-INFO +1 -1
  3. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/doc/overview.md +0 -14
  4. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/pyproject.toml +1 -1
  5. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/data_prep_connector.egg-info/PKG-INFO +1 -1
  6. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/crawler.py +0 -6
  7. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/spiders/sitemap.py +6 -14
  8. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/utils.py +0 -5
  9. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/test_crawler.py +1 -12
  10. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/test_middlewares.py +0 -12
  11. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/test_sitemap_spider.py +9 -31
  12. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/test_utils.py +3 -29
  13. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/README.md +0 -0
  14. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/setup.cfg +0 -0
  15. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
  16. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
  17. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/data_prep_connector.egg-info/requires.txt +0 -0
  18. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/data_prep_connector.egg-info/top_level.txt +0 -0
  19. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/__init__.py +0 -0
  20. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/__init__.py +0 -0
  21. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/item.py +0 -0
  22. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/logging.py +0 -0
  23. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/middlewares.py +0 -0
  24. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/pipelines.py +0 -0
  25. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/settings.py +0 -0
  26. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/src/dpk_connector/core/spiders/__init__.py +0 -0
  27. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/__init__.py +0 -0
  28. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev0}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
@@ -2,8 +2,6 @@
2
2
  REPOROOT=..
3
3
  include $(REPOROOT)/.make.defaults
4
4
 
5
- DPK_CONNECTOR_VERSION=0.2.2
6
-
7
5
  clean::
8
6
  @# Help: Clean up the distribution build and the venv
9
7
  rm -rf dist venv
@@ -15,7 +13,7 @@ clean::
15
13
  setup::
16
14
 
17
15
  set-versions: .check-env
18
- $(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
16
+ $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
19
17
 
20
18
  build:: build-dist
21
19
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2
3
+ Version: 0.2.2.dev0
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
@@ -10,20 +10,6 @@ Features:
10
10
  - Mime type filters: You can restrict mime types which can be downloaded.
11
11
  - Parallel processing: Requests to websites are processed in parallel.
12
12
 
13
- ## How to install
14
-
15
- ### From PyPI
16
-
17
- ```sh
18
- pip install data-prep-connector
19
- ```
20
-
21
- ### From Github
22
-
23
- ```sh
24
- pip install git+https://github.com/IBM/data-prep-kit.git@dev#subdirectory=data-connector-lib
25
- ```
26
-
27
13
  ## Example usage
28
14
 
29
15
  ```python
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_connector"
3
- version = "0.2.2"
3
+ version = "0.2.2.dev0"
4
4
  requires-python = ">=3.10"
5
5
  keywords = [
6
6
  "data",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2
3
+ Version: 0.2.2.dev0
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
@@ -74,7 +74,6 @@ def async_crawl(
74
74
  user_agent: str = "",
75
75
  headers: dict[str, str] = {},
76
76
  allow_domains: Collection[str] = (),
77
- subdomain_focus: bool = False,
78
77
  path_focus: bool = False,
79
78
  allow_mime_types: Collection[str] = (
80
79
  "application/pdf",
@@ -97,7 +96,6 @@ def async_crawl(
97
96
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
98
97
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
99
98
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
100
- subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
101
99
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
102
100
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
103
101
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -142,7 +140,6 @@ def async_crawl(
142
140
  seed_urls=seed_urls,
143
141
  callback=on_downloaded,
144
142
  allow_domains=allow_domains,
145
- subdomain_focus=subdomain_focus,
146
143
  path_focus=path_focus,
147
144
  allow_mime_types=allow_mime_types,
148
145
  disallow_mime_types=disallow_mime_types,
@@ -158,7 +155,6 @@ def crawl(
158
155
  user_agent: str = "",
159
156
  headers: dict[str, str] = {},
160
157
  allow_domains: Collection[str] = (),
161
- subdomain_focus: bool = False,
162
158
  path_focus: bool = False,
163
159
  allow_mime_types: Collection[str] = (
164
160
  "application/pdf",
@@ -181,7 +177,6 @@ def crawl(
181
177
  user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
182
178
  headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
183
179
  allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
184
- subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
185
180
  path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
186
181
  allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
187
182
  disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -203,7 +198,6 @@ def crawl(
203
198
  user_agent,
204
199
  headers,
205
200
  allow_domains,
206
- subdomain_focus,
207
201
  path_focus,
208
202
  allow_mime_types,
209
203
  disallow_mime_types,
@@ -28,7 +28,6 @@ from dpk_connector.core.utils import (
28
28
  get_content_type,
29
29
  get_etld1,
30
30
  get_focus_path,
31
- get_fqdn,
32
31
  is_allowed_path,
33
32
  urlparse_cached,
34
33
  )
@@ -43,7 +42,6 @@ class BaseSitemapSpider(SitemapSpider):
43
42
  self,
44
43
  seed_urls: Collection[str],
45
44
  allow_domains: Collection[str] = (),
46
- subdomain_focus: bool = False,
47
45
  path_focus: bool = False,
48
46
  allow_mime_types: Collection[str] = (),
49
47
  disallow_mime_types: Collection[str] = (),
@@ -90,15 +88,11 @@ class BaseSitemapSpider(SitemapSpider):
90
88
  self.focus_paths.add(path)
91
89
 
92
90
  # Domains and mime types filtering
93
- if allow_domains:
94
- self.allowed_domains = set(allow_domains)
95
- elif subdomain_focus:
96
- self.allowed_domains = set()
97
- for url in seed_urls:
98
- if fqdn := get_fqdn(url):
99
- self.allowed_domains.add(fqdn)
100
- else:
101
- self.allowed_domains = set(get_etld1(url) for url in seed_urls)
91
+ self.allowed_domains = set(
92
+ allow_domains
93
+ if len(allow_domains) > 0
94
+ else [get_etld1(url) for url in seed_urls]
95
+ )
102
96
  self.allow_mime_types = set(
103
97
  [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
104
98
  )
@@ -161,9 +155,7 @@ class BaseSitemapSpider(SitemapSpider):
161
155
  )
162
156
 
163
157
  def _parse_sitemap(self, response: Response):
164
- yield ConnectorItem(
165
- dropped=False, downloaded=False, system_request=True, sitemap=True
166
- )
158
+ yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
167
159
 
168
160
  seed_url = response.meta["seed_url"]
169
161
 
@@ -57,11 +57,6 @@ def get_etld1(url: str) -> str:
57
57
  return f"{ext.domain}.{ext.suffix}"
58
58
 
59
59
 
60
- def get_fqdn(url: str) -> str:
61
- ext = tldextract.extract(url)
62
- return ext.fqdn
63
-
64
-
65
60
  def get_focus_path(url: str) -> str | None:
66
61
  parts = urlparse_cached(url)
67
62
  if len(parts.path.split("/")) > 2:
@@ -1,16 +1,5 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  import pytest
2
+
14
3
  from dpk_connector.core.crawler import crawl
15
4
 
16
5
 
@@ -1,15 +1,3 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  import pytest
14
2
  from dpk_connector.core.middlewares import DelayingProtegoRobotParser
15
3
  from pytest_mock import MockerFixture
@@ -1,24 +1,13 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  from pathlib import Path
14
2
 
15
3
  import pytest
16
- from dpk_connector.core.item import ConnectorItem
17
- from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
18
4
  from scrapy import Request
19
5
  from scrapy.crawler import Crawler
20
6
  from scrapy.http import HtmlResponse
21
7
 
8
+ from dpk_connector.core.item import ConnectorItem
9
+ from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider
10
+
22
11
 
23
12
  @pytest.fixture
24
13
  def crawler() -> Crawler:
@@ -33,21 +22,6 @@ def crawler() -> Crawler:
33
22
  return crawler
34
23
 
35
24
 
36
- def test_init_subdomain_focus():
37
- spider = BaseSitemapSpider(
38
- seed_urls=(
39
- "http://blog.example.com/",
40
- "http://contents.example.com/",
41
- ),
42
- subdomain_focus=True,
43
- )
44
- assert spider.seed_urls == {
45
- "http://blog.example.com/",
46
- "http://contents.example.com/",
47
- }
48
- assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}
49
-
50
-
51
25
  def test_init_path_focus():
52
26
  spider = BaseSitemapSpider(
53
27
  seed_urls=(
@@ -85,7 +59,9 @@ def test_parse(datadir: Path, crawler: Crawler):
85
59
  assert body.decode("utf-8") == response_body
86
60
  assert headers == {"Content-Type": "text/html"}
87
61
 
88
- spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
62
+ spider = ConnectorSitemapSpider.from_crawler(
63
+ crawler, seed_urls=("http://example.com",), callback=callback
64
+ )
89
65
  request = Request(
90
66
  "http://example.com/index.html",
91
67
  meta={
@@ -103,7 +79,9 @@ def test_parse(datadir: Path, crawler: Crawler):
103
79
  parsed = spider.parse(response)
104
80
 
105
81
  item = next(parsed)
106
- assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
82
+ assert item == ConnectorItem(
83
+ dropped=False, downloaded=True, system_request=False, sitemap=False
84
+ )
107
85
 
108
86
  for next_request in parsed:
109
87
  assert isinstance(next_request, Request) is True
@@ -1,15 +1,3 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  # Assisted by WCA@IBM
14
2
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
15
3
 
@@ -19,7 +7,6 @@ from dpk_connector.core.utils import (
19
7
  get_content_type,
20
8
  get_etld1,
21
9
  get_focus_path,
22
- get_fqdn,
23
10
  get_header_value,
24
11
  get_mime_type,
25
12
  is_allowed_path,
@@ -32,7 +19,9 @@ from scrapy.http import Request, Response
32
19
 
33
20
 
34
21
  def test_get_header_value():
35
- response = Response("http://example.com", headers={"Content-Type": "application/json"})
22
+ response = Response(
23
+ "http://example.com", headers={"Content-Type": "application/json"}
24
+ )
36
25
  assert get_header_value(response, "Content-Type") == "application/json"
37
26
 
38
27
 
@@ -94,21 +83,6 @@ def test_get_etld1(url: str, expected: str):
94
83
  assert get_etld1(url) == expected
95
84
 
96
85
 
97
- @pytest.mark.parametrize(
98
- "url,expected",
99
- [
100
- ("http://www.example.com", "www.example.com"),
101
- ("https://www.example.co.uk", "www.example.co.uk"),
102
- ("http://www.example.com/path?query=string#fragment", "www.example.com"),
103
- ("http://localhost:8080/", ""),
104
- ("http://www.example.com:8080/", "www.example.com"),
105
- ("http://www.sub.example.com:8080/", "www.sub.example.com"),
106
- ],
107
- )
108
- def test_get_fqdn(url: str, expected: str):
109
- assert get_fqdn(url) == expected
110
-
111
-
112
86
  @pytest.mark.parametrize(
113
87
  "url,expected",
114
88
  [