data-prep-connector 0.2.2__tar.gz → 0.2.2.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28):
  1. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/Makefile +1 -3
  2. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/PKG-INFO +1 -1
  3. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/pyproject.toml +1 -1
  4. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/PKG-INFO +1 -1
  5. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_crawler.py +1 -12
  6. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_middlewares.py +0 -12
  7. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_sitemap_spider.py +6 -14
  8. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_utils.py +3 -13
  9. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/README.md +0 -0
  10. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/doc/overview.md +0 -0
  11. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/setup.cfg +0 -0
  12. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/SOURCES.txt +0 -0
  13. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/dependency_links.txt +0 -0
  14. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/requires.txt +0 -0
  15. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/data_prep_connector.egg-info/top_level.txt +0 -0
  16. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/__init__.py +0 -0
  17. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/__init__.py +0 -0
  18. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/crawler.py +0 -0
  19. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/item.py +0 -0
  20. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/logging.py +0 -0
  21. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/middlewares.py +0 -0
  22. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/pipelines.py +0 -0
  23. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/settings.py +0 -0
  24. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/spiders/__init__.py +0 -0
  25. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/spiders/sitemap.py +0 -0
  26. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/src/dpk_connector/core/utils.py +0 -0
  27. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/__init__.py +0 -0
  28. {data_prep_connector-0.2.2 → data_prep_connector-0.2.2.dev1}/test/dpk_connector/core/test_sitemap_spider/index.html +0 -0
@@ -2,8 +2,6 @@
2
2
  REPOROOT=..
3
3
  include $(REPOROOT)/.make.defaults
4
4
 
5
- DPK_CONNECTOR_VERSION=0.2.2
6
-
7
5
  clean::
8
6
  @# Help: Clean up the distribution build and the venv
9
7
  rm -rf dist venv
@@ -15,7 +13,7 @@ clean::
15
13
  setup::
16
14
 
17
15
  set-versions: .check-env
18
- $(MAKE) TOML_VERSION=$(DPK_CONNECTOR_VERSION) .defaults.update-toml
16
+ $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
19
17
 
20
18
  build:: build-dist
21
19
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2
3
+ Version: 0.2.2.dev1
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_connector"
3
- version = "0.2.2"
3
+ version = "0.2.2.dev1"
4
4
  requires-python = ">=3.10"
5
5
  keywords = [
6
6
  "data",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_connector
3
- Version: 0.2.2
3
+ Version: 0.2.2.dev1
4
4
  Summary: Scalable and Compliant Web Crawler
5
5
  Author-email: Hiroya Matsubara <hmtbr@jp.ibm.com>
6
6
  License: Apache-2.0
@@ -1,16 +1,5 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  import pytest
2
+
14
3
  from dpk_connector.core.crawler import crawl
15
4
 
16
5
 
@@ -1,15 +1,3 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  import pytest
14
2
  from dpk_connector.core.middlewares import DelayingProtegoRobotParser
15
3
  from pytest_mock import MockerFixture
@@ -1,15 +1,3 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  from pathlib import Path
14
2
 
15
3
  import pytest
@@ -85,7 +73,9 @@ def test_parse(datadir: Path, crawler: Crawler):
85
73
  assert body.decode("utf-8") == response_body
86
74
  assert headers == {"Content-Type": "text/html"}
87
75
 
88
- spider = ConnectorSitemapSpider.from_crawler(crawler, seed_urls=("http://example.com",), callback=callback)
76
+ spider = ConnectorSitemapSpider.from_crawler(
77
+ crawler, seed_urls=("http://example.com",), callback=callback
78
+ )
89
79
  request = Request(
90
80
  "http://example.com/index.html",
91
81
  meta={
@@ -103,7 +93,9 @@ def test_parse(datadir: Path, crawler: Crawler):
103
93
  parsed = spider.parse(response)
104
94
 
105
95
  item = next(parsed)
106
- assert item == ConnectorItem(dropped=False, downloaded=True, system_request=False, sitemap=False)
96
+ assert item == ConnectorItem(
97
+ dropped=False, downloaded=True, system_request=False, sitemap=False
98
+ )
107
99
 
108
100
  for next_request in parsed:
109
101
  assert isinstance(next_request, Request) is True
@@ -1,15 +1,3 @@
1
- # (C) Copyright IBM Corp. 2024.
2
- # Licensed under the Apache License, Version 2.0 (the “License”);
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- # http://www.apache.org/licenses/LICENSE-2.0
6
- # Unless required by applicable law or agreed to in writing, software
7
- # distributed under the License is distributed on an “AS IS” BASIS,
8
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
- # See the License for the specific language governing permissions and
10
- # limitations under the License.
11
- ################################################################################
12
-
13
1
  # Assisted by WCA@IBM
14
2
  # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2
15
3
 
@@ -32,7 +20,9 @@ from scrapy.http import Request, Response
32
20
 
33
21
 
34
22
  def test_get_header_value():
35
- response = Response("http://example.com", headers={"Content-Type": "application/json"})
23
+ response = Response(
24
+ "http://example.com", headers={"Content-Type": "application/json"}
25
+ )
36
26
  assert get_header_value(response, "Content-Type") == "application/json"
37
27
 
38
28