crossref-matcher 0.0.dev2317948835__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. crossref_matcher-0.0.dev2317948835/LICENSE +21 -0
  2. crossref_matcher-0.0.dev2317948835/MANIFEST.in +5 -0
  3. crossref_matcher-0.0.dev2317948835/PKG-INFO +34 -0
  4. crossref_matcher-0.0.dev2317948835/README.md +107 -0
  5. crossref_matcher-0.0.dev2317948835/crossref_matcher/MatchTask.py +36 -0
  6. crossref_matcher-0.0.dev2317948835/crossref_matcher/__init__.py +6 -0
  7. crossref_matcher-0.0.dev2317948835/crossref_matcher/app.py +248 -0
  8. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/__init__.py +13 -0
  9. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/all_links.py +185 -0
  10. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/api_routes.py +20 -0
  11. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/dataset_ingest.py +143 -0
  12. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/evaluation.py +370 -0
  13. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/google_sheets_connector.py +33 -0
  14. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/ror_utils.py +14 -0
  15. crossref_matcher-0.0.dev2317948835/crossref_matcher/evaluation/schemas.py +128 -0
  16. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/__init__.py +0 -0
  17. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/preprint_matching/__init__.py +0 -0
  18. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/preprint_matching/indexer_service.py +160 -0
  19. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/ror_organizations/__init__.py +0 -0
  20. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/ror_organizations/create_index.py +115 -0
  21. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/ror_organizations/index_data.py +293 -0
  22. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/ror_organizations/ror_opensearch_update_dag.py +124 -0
  23. crossref_matcher-0.0.dev2317948835/crossref_matcher/indexes/ror_organizations/ror_zenodo_api.py +23 -0
  24. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/__init__.py +0 -0
  25. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/http_utils.py +14 -0
  26. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/run_strategy_on_dataset.py +78 -0
  27. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/schemas.py +81 -0
  28. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/search.py +42 -0
  29. crossref_matcher-0.0.dev2317948835/crossref_matcher/matching/utils.py +24 -0
  30. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/VERSION.txt +1 -0
  31. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/__init__.py +0 -0
  32. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/affiliation/__init__.py +0 -0
  33. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/__init__.py +0 -0
  34. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/common_ror_endings/__init__.py +0 -0
  35. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/common_ror_endings/common_ror_endings.txt +46 -0
  36. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/countries/__init__.py +0 -0
  37. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/countries/china_placenames.txt +363 -0
  38. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/countries/countries.txt +590 -0
  39. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/countries/countries2.txt +923 -0
  40. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/datasets/__init__.py +0 -0
  41. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/__init__.py +0 -0
  42. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/__init__.py +0 -0
  43. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/data_counters0.pkl +1 -0
  44. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/eval/events.out.tfevents.1690572611.knopfler.local +0 -0
  45. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/model.opt_slots0.npy.gz +0 -0
  46. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/model.pkl.gz +0 -0
  47. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/model.weights.npy.gz +0 -0
  48. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ensemble_ml/train/events.out.tfevents.1690572611.knopfler.local +0 -0
  49. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/data_counters0.pkl +1 -0
  50. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/eval/events.out.tfevents.1690302892.knopfler.local +0 -0
  51. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/model.opt_slots0.npy.gz +0 -0
  52. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/model.pkl.gz +0 -0
  53. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/model.weights.npy.gz +0 -0
  54. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/preprint_ml/train/events.out.tfevents.1690302891.knopfler.local +0 -0
  55. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/__init__.py +0 -0
  56. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/data_counters0.pkl +1 -0
  57. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/eval/events.out.tfevents.1681511550.knopfler.local +0 -0
  58. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/model.opt_slots0.npy.gz +0 -0
  59. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/model.pkl.gz +0 -0
  60. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/model.weights.npy.gz +0 -0
  61. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/data/models/reference_nn_unstructured/train/events.out.tfevents.1681511550.knopfler.local +0 -0
  62. crossref_matcher-0.0.dev2317948835/crossref_matcher/resources/plugins.json +34 -0
  63. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/__init__.py +389 -0
  64. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/__init__.py +0 -0
  65. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/es_utils.py +46 -0
  66. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/matching.py +128 -0
  67. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/multi_search/__init__.py +0 -0
  68. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/multi_search/strategy.py +126 -0
  69. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/single_search/__init__.py +0 -0
  70. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/single_search/strategy.py +248 -0
  71. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/affiliation/strategy.py +11 -0
  72. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/common.py +96 -0
  73. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/__init__.py +0 -0
  74. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_name_to_ror_search/__init__.py +0 -0
  75. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_name_to_ror_search/strategy.py +418 -0
  76. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_single_search_01/__init__.py +0 -0
  77. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_single_search_01/strategy.py +481 -0
  78. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_single_search_02/__init__.py +0 -0
  79. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/funder/funder_single_search_02/strategy.py +520 -0
  80. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/healthcheck/__init__.py +0 -0
  81. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/healthcheck/healthcheck/__init__.py +0 -0
  82. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/healthcheck/healthcheck/strategy.py +17 -0
  83. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/__init__.py +0 -0
  84. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_avg/__init__.py +0 -0
  85. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_avg/strategy.py +70 -0
  86. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_cascade/__init__.py +0 -0
  87. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_cascade/strategy.py +60 -0
  88. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_ml/__init__.py +0 -0
  89. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_ml/get_train_data.py +126 -0
  90. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_ml/strategy.py +89 -0
  91. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_ml/train.py +108 -0
  92. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/ensemble_ml/utils.py +37 -0
  93. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/preprint_ml/__init__.py +0 -0
  94. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/preprint_ml/get_train_data.py +116 -0
  95. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/preprint_ml/strategy.py +86 -0
  96. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/preprint_ml/train.py +108 -0
  97. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/preprint_ml/utils.py +124 -0
  98. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv/__init__.py +0 -0
  99. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv/strategy.py +163 -0
  100. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv_es/__init__.py +0 -0
  101. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv_es/strategy.py +163 -0
  102. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv_rels/__init__.py +0 -0
  103. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/preprint/sbmv_rels/strategy.py +65 -0
  104. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/__init__.py +0 -0
  105. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/cs_combined/__init__.py +0 -0
  106. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/cs_combined/strategy.py +13 -0
  107. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/nn_unstructured/__init__.py +0 -0
  108. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/nn_unstructured/get_train_data.py +62 -0
  109. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/nn_unstructured/strategy.py +69 -0
  110. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/nn_unstructured/train.py +71 -0
  111. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/nn_unstructured/utils.py +185 -0
  112. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/parsed_doi/__init__.py +0 -0
  113. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/parsed_doi/strategy.py +40 -0
  114. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/parsed_isbn/__init__.py +0 -0
  115. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/parsed_isbn/strategy.py +32 -0
  116. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv/__init__.py +0 -0
  117. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv/strategy.py +67 -0
  118. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv_orig/__init__.py +0 -0
  119. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv_orig/strategy.py +229 -0
  120. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv_unstructured/__init__.py +0 -0
  121. crossref_matcher-0.0.dev2317948835/crossref_matcher/strategies/reference/sbmv_unstructured/strategy.py +252 -0
  122. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/PKG-INFO +34 -0
  123. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/SOURCES.txt +127 -0
  124. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/dependency_links.txt +1 -0
  125. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/entry_points.txt +2 -0
  126. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/requires.txt +24 -0
  127. crossref_matcher-0.0.dev2317948835/crossref_matcher.egg-info/top_level.txt +1 -0
  128. crossref_matcher-0.0.dev2317948835/setup.cfg +4 -0
  129. crossref_matcher-0.0.dev2317948835/setup.py +72 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Crossref
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include crossref_matcher/resources/plugins.json
2
+ include crossref_matcher/resources/VERSION.txt
3
+ recursive-include crossref_matcher/resources/data/models *.pkl *.gz *.local
4
+ recursive-include crossref_matcher/resources/data/common_ror_endings *.txt
5
+ recursive-include crossref_matcher/resources/data/countries *.txt
@@ -0,0 +1,34 @@
1
+ Metadata-Version: 2.4
2
+ Name: crossref-matcher
3
+ Version: 0.0.dev2317948835
4
+ Summary: Crossref's matching API
5
+ Home-page: https://gitlab.com/crossref/marple
6
+ Author: Crossref
7
+ License-File: LICENSE
8
+ Requires-Dist: fastapi>=0.100.0
9
+ Requires-Dist: uvicorn>=0.30.0
10
+ Requires-Dist: pydantic>=2.0.0
11
+ Requires-Dist: crossref-example-matching-strategy>=0.2.1
12
+ Provides-Extra: evaluation
13
+ Requires-Dist: scikit-learn>=1.7.0; extra == "evaluation"
14
+ Provides-Extra: strategies-core
15
+ Requires-Dist: boto3>=1.30.0; extra == "strategies-core"
16
+ Requires-Dist: opensearch-py>=2.0.0; extra == "strategies-core"
17
+ Requires-Dist: opensearch-dsl>=2.0.0; extra == "strategies-core"
18
+ Requires-Dist: rapidfuzz>=3.0.0; extra == "strategies-core"
19
+ Requires-Dist: unidecode>=1.3.0; extra == "strategies-core"
20
+ Requires-Dist: fuzzywuzzy>=0.17.0; extra == "strategies-core"
21
+ Requires-Dist: geonamescache>=1.3.0; extra == "strategies-core"
22
+ Requires-Dist: requests>=2.30.0; extra == "strategies-core"
23
+ Requires-Dist: ratelimit==2.2.0; extra == "strategies-core"
24
+ Provides-Extra: test
25
+ Requires-Dist: pytest>=8.4.2; extra == "test"
26
+ Requires-Dist: anyio>=4.10.0; extra == "test"
27
+ Requires-Dist: httpx>=0.28.1; extra == "test"
28
+ Requires-Dist: jsonschema>=4.25.1; extra == "test"
29
+ Dynamic: author
30
+ Dynamic: home-page
31
+ Dynamic: license-file
32
+ Dynamic: provides-extra
33
+ Dynamic: requires-dist
34
+ Dynamic: summary
@@ -0,0 +1,107 @@
1
+ # Marple
2
+
3
+ Crossref's matching service—codenamed Marple—provides the functionality for metadata matching:
4
+
5
+ * run multiple matching tasks
6
+ * implement multiple matching strategies
7
+ * create and populate backend indexes for the matching strategies
8
+ * evaluate the strategies against ground truth datasets
9
+
10
+ ## Terminology
11
+
12
+ In short, matching is the task or process of finding an identifier of an item based on a structured or unstructured “description” of it. Examples include:
13
+
14
+ * finding the DOI based on a bibliographic reference,
15
+ * finding the ROR ID based on an affiliation string,
16
+ * finding the grant DOI based on the acknowledgment section of a paper.
17
+
18
+ There are also tasks that are technically not matching but are closely related, and in practice, we often treat them as matching tasks:
19
+
20
+ * finding a duplicate of a journal article,
21
+ * linking preprints to journal articles.
22
+
23
+ And there are a few tasks that are often included in matching conversations, but are definitely not matching, for example:
24
+
25
+ * retrieving the metadata of a work based on its DOI,
26
+ * retrieving all works that contain the phrase “citation parsing” in the title.
27
+
28
+ A matching task defines the nature of matching. Example matching tasks are bibliographic reference matching, preprint matching, and affiliation matching. A matching task has input and output:
29
+
30
+ * Input is all the data needed for the matching, for example a structured record, a list of structured records, or unstructured text.
31
+ * Output is simply the matched identifiers. Within a specific matching task, output identifiers are usually of a specific type (e.g. we match to ROR IDs, and not ORCID iDs). In some cases, there can be a specific target database as well (e.g. we match only to DataCite DOIs). The output identifiers can have different cardinality depending on the task: some matching tasks allow zero, one, and/or more identifiers as the result of matching a single input.
32
+
33
+ A matching strategy defines how the matching is actually done. Multiple strategies can exist for a specific matching task. Some strategies can even run other strategies and combine their outcomes.
34
+
35
+ ## Evaluation
36
+
37
+ We can evaluate a matching strategy using an evaluation set. See [Evaluation](crossref_matcher/evaluation/README.md) for more about this.
38
+
39
+ ## Using `crossref-matcher` as a library
40
+
41
+ ### Installation
42
+
43
+ Install with pip: `pip install crossref-matcher`
44
+
45
+ ### Usage
46
+
47
+ With the `crossref-matcher` library installed in your own project, you can import the `Strategy` class to develop your own matching strategies:
48
+
49
+ ```python
50
+ from crossref_matcher import Strategy, MatchTask
51
+ class MyMatchingStrategy(Strategy):
52
+ id = "my-matching-strategy"
53
+ task = MatchTask.REFERENCE
54
+ description = "Example strategy."
55
+
56
+ def __init__(self):
57
+ pass
58
+
59
+ def match(self, input_data):
60
+ # matching logic goes here
61
+ ```
62
+
63
+ Find more information on the `Strategy` class in [`crossref_matcher/strategies/__init__.py`](crossref_matcher/strategies/__init__.py).
64
+
65
+ ## Importing external strategies as plugins
66
+
67
+ After installing an external strategy, you can add it to [`plugins.json`](crossref_matcher/resources/plugins.json) to use it.
68
+
69
+ To see this in action, you can add the following entry to the `crossref_matcher.plugins.enabled` array in `plugins.json`:
70
+
71
+ ```json
72
+ "crossref_example_matching_strategy"
73
+ ```
74
+
75
+ This will make available the [example matching strategy](https://gitlab.com/crossref/example-matching-strategy), which should already be installed (via `setup.py` and/or `requirements.txt`).
76
+
77
+ ## Development
78
+
79
+ ### Indexes
80
+
81
+ Some strategies need an OpenSearch index to match. The [indexes directory](crossref_matcher/indexes/) contains scripts for creating and populating the indexes.
82
+
83
+ Use the `ES_HOST` environment variable to point Marple to the OpenSearch cluster.
84
+
85
+ ### How to run
86
+
87
+ Run
88
+
89
+ ```sh
90
+ python -m crossref_matcher.app --host 0.0.0.0 --port 8000
91
+ ```
92
+
93
+ and then visit http://localhost:8000/docs
94
+
95
+ ### Strategies
96
+
97
+ Strategies are located in the [strategies directory](crossref_matcher/strategies/). They inherit from the [`Strategy` abstract base class](crossref_matcher/strategies/__init__.py).
98
+
99
+ Strategies must be added to the "enabled" list in [plugins.json](crossref_matcher/resources/plugins.json) before they can be used.
100
+
101
+ ### Dependencies
102
+
103
+ Requirements are defined in `setup.py`. Requirements files with exact versions can be updated with `pip-compile`, which is a part of `pip-tools`. First run `pip install pip-tools`. Then:
104
+
105
+ * To update `requirements.txt` file: `pip-compile setup.py --extra strategies-core`
106
+
107
+ * To update `requirements-tests.txt` file: `pip-compile setup.py --extra strategies-core --extra evaluation --extra test -o requirements-tests.txt`
@@ -0,0 +1,36 @@
1
+ from enum import Enum
2
+
3
+
4
class MatchTask(Enum):
    """Matching tasks known to the service.

    Every member is declared as a ``(value, description)`` pair.  ``__new__``
    keeps the first element as the enum value and exposes the second one as
    the ``description`` attribute of the member.
    """

    REFERENCE = (
        "reference",
        "Matching bibliographic references to works, such as "
        "journal articles, conference papers, etc.",
    )
    PREPRINT = ("preprint", "Matching journal articles to preprints.")
    AFFILIATION = ("affiliation", "Matching affiliations to ROR IDs.")
    FUNDER = ("funder", "Matching funders to ROR IDs.")
    OTHER = ("other", "A generic matching task.")
    HEALTHCHECK = (
        "healthcheck",
        "used internally to check that things are working properly",
    )

    def __new__(cls, value, description):
        # Only ``value`` becomes the enum value, so lookups such as
        # ``MatchTask("reference")`` work with the plain string.
        member = object.__new__(cls)
        member._value_ = value
        member.description = description
        return member

    @classmethod
    def get_tasks(cls):
        """Return one ``{"id", "description"}`` dict per task, in declaration order."""
        tasks = []
        for member in cls:
            tasks.append({"id": member.value, "description": member.description})
        return tasks

    @classmethod
    def get_ids(cls):
        """Return the string id of every task, in declaration order."""
        return [entry["id"] for entry in cls.get_tasks()]

    @property
    def id(self):
        """Alias for ``value``."""
        return self._value_
@@ -0,0 +1,6 @@
1
+ from .MatchTask import MatchTask as MatchTask
2
+ from .strategies import Strategy as Strategy
3
+ from crossref_matcher.matching.utils import get_resource_path
4
+
5
# Read the package version from the bundled VERSION.txt resource.  The encoding
# is pinned so the result does not depend on the platform's locale default.
with open(
    get_resource_path("crossref_matcher.resources", "VERSION.txt"),
    "r",
    encoding="utf-8",
) as f:
    __version__ = f.read().strip()
@@ -0,0 +1,248 @@
1
+ import json
2
+ import time
3
+ from datetime import datetime
4
+ from starlette.responses import JSONResponse
5
+ from typing import Any
6
+ from contextlib import asynccontextmanager
7
+ from crossref_matcher import MatchTask, __version__
8
+ from crossref_matcher.evaluation.api_routes import router as evaluation_router
9
+ from crossref_matcher.matching import schemas
10
+ from crossref_matcher.strategies import (
11
+ Strategy,
12
+ get_default_strategy,
13
+ list_strategies,
14
+ get_strategy,
15
+ )
16
+
17
+ from fastapi import FastAPI, HTTPException, Request, Query
18
+ from typing import Union, Annotated
19
+
20
+ import logging
21
+
22
+ import uvicorn
23
+ import uvicorn.logging
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+ # logging.getLogger("matching").setLevel(logging.DEBUG)
28
+ # handler = logging.StreamHandler()
29
+ # handler.setFormatter(
30
+ # logging.Formatter(
31
+ # fmt="%(asctime)s %(name)s.%(lineno)d %(levelname)s : %(message)s",
32
+ # datefmt="%H:%M:%S",
33
+ # )
34
+ # )
35
+ # logging.getLogger("matching").addHandler(handler)
36
+
37
+
38
class AsciiJSONResponse(JSONResponse):
    """JSON response whose body is serialized with non-ASCII characters escaped."""

    def render(self, content: Any) -> bytes:
        # ensure_ascii=True makes json.dumps escape every non-ASCII character,
        # so the encoded body consists of ASCII bytes only.
        serialized = json.dumps(content, ensure_ascii=True)
        return serialized.encode("utf-8")
41
+
42
+
43
+ Strategy.load_plugins()
44
+
45
# Tag groups passed to FastAPI as ``openapi_tags``.
tags_metadata = [
    dict(name="Health", description="Health check endpoints."),
    dict(
        name="Tasks and strategies",
        description="Endpoints to list available matching tasks and strategies.",
    ),
    dict(name="Matching", description="Endpoints to perform matching."),
    dict(
        name="Evaluation",
        description="Endpoints to evaluate matching strategies on datasets.",
    ),
]
57
+
58
+
59
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook for the application; currently does no startup or shutdown work."""
    # Startup logic belongs before the yield.
    yield
    # Shutdown logic belongs after the yield.
64
+
65
+
66
# The ASGI application object served by uvicorn (see run_app).
app = FastAPI(
    title="Crossref Matching API",
    description=(
        "Matching API allows to match structured and unstructured data to identifiers."
    ),
    openapi_tags=tags_metadata,
    lifespan=lifespan,
)
73
+
74
+
75
@app.middleware("http")
async def add_headers(request: Request, call_next):
    """Add start-time, processing-time and service-version headers to each response."""
    # perf_counter measures the elapsed time; the wall-clock timestamp is only
    # used for the human-readable X-Start-Time header.
    timer_start = time.perf_counter()
    started_at = datetime.fromtimestamp(time.time())
    response = await call_next(request)
    elapsed = time.perf_counter() - timer_start

    extra_headers = (
        ("X-Start-Time", started_at.isoformat()),
        ("X-Process-Time", str(elapsed)),
        ("X-Service-Version", __version__),
    )
    for header_name, header_value in extra_headers:
        response.headers[header_name] = header_value
    return response
85
+
86
+
87
+ app.include_router(evaluation_router)
88
+
89
+
90
@app.get(
    "/heartbeat",
    name="",
    description="Heartbeat",
    tags=["Health"],
    response_model=schemas.HeartbeatResponse,
    response_class=AsciiJSONResponse,
)
async def heartbeat():
    """Return a fixed payload signalling that the service is responding."""
    return dict(status="ok")
100
+
101
+
102
@app.get(
    "/tasks",
    name="",
    description="The list of supported matching tasks.",
    tags=["Tasks and strategies"],
    response_model=schemas.TasksResponse,
    response_class=AsciiJSONResponse,
)
async def list_tasks_endpoint():
    """List every matching task together with the id of its default strategy.

    When a task has no default strategy, or more than one, the
    ``default_strategy`` field carries a placeholder or warning text instead
    of a strategy id.
    """
    from crossref_matcher.strategies import (
        MultipleDefaultStrategiesError,
        NoDefaultStrategyError,
    )

    tasks = MatchTask.get_tasks()
    for task in tasks:
        try:
            default_label = get_default_strategy(task["id"]).id
        except NoDefaultStrategyError:
            # The generic "other" task gets a neutral placeholder rather
            # than a warning when it has no default strategy.
            if task["id"] == "other":
                default_label = "N/A"
            else:
                default_label = "WARNING: No default strategy set"
        except MultipleDefaultStrategiesError:
            default_label = "WARNING: Multiple default strategies set"
        task["default_strategy"] = default_label

    message = schemas.TasksMessage(items=tasks)
    return schemas.TasksResponse(message=message)
129
+
130
+
131
@app.get(
    "/tasks/{task_id}/strategies",
    name="",
    description="The list of strategies available for a given matching task.",
    tags=["Tasks and strategies"],
    response_model=schemas.StrategiesResponse,
    response_class=AsciiJSONResponse,
)
async def list_strategies_endpoint(task_id: str, include_disabled: bool = False):
    """List the strategies registered for one matching task.

    ``task_id`` may be given with or without the "-matching" part
    (e.g. "reference-matching" or "reference").

    Raises:
        HTTPException: 404 when ``task_id`` does not name a matching task.
    """
    normalized_id = task_id.replace("-matching", "")
    try:
        match_task = MatchTask(normalized_id)
    except ValueError:
        raise HTTPException(status_code=404, detail="No such matching task")

    items = []
    for strategy in list_strategies(
        task=match_task.id, include_disabled=include_disabled
    ):
        items.append(
            {
                "id": strategy.id,
                "description": strategy.description,
                "default": strategy.is_default(),
                "disabled": strategy.is_disabled(),
            }
        )

    message = schemas.StrategiesMessage(items=items)
    return schemas.StrategiesResponse(message=message)
158
+
159
+
160
def match(task: str, input_data: str, strategy: Union[str, None] = None):
    """Run a matching strategy on ``input_data`` and wrap the result for the API.

    Args:
        task: Matching task id, with or without the "-matching" part.
        input_data: Raw input passed unchanged to the strategy's ``match``.
        strategy: Strategy id forwarded to ``get_strategy``; may be ``None``.

    Returns:
        A ``schemas.MatchedItemsResponse`` holding the validated items, the id
        of the strategy that produced them, and the strategy instance's
        ``target_data`` attribute (``None`` when it has none).

    Raises:
        HTTPException: 404 when the task is unknown or ``get_strategy`` raises
            ``ValueError``; 400 when the strategy raises ``NotImplementedError``.
    """
    normalized_task = task.replace("-matching", "")
    try:
        match_task = MatchTask(normalized_task)
    except ValueError:
        raise HTTPException(status_code=404, detail="No such matching task")

    try:
        strategy_class = get_strategy(strategy, match_task)
    except ValueError as e:
        raise HTTPException(status_code=404, detail=str(e))

    try:
        strategy_id = strategy_class.id
        instance = strategy_class()
        raw_items = instance.match(input_data)
        target_data = getattr(instance, "target_data", None)
        validated = [schemas.MatchedItem.model_validate(raw) for raw in raw_items]
        message = schemas.MatchedItemsMessage(
            items=validated,
            strategy=strategy_id,
            target_data=target_data,
        )
        return schemas.MatchedItemsResponse(message=message)
    except NotImplementedError:
        raise HTTPException(status_code=400, detail="Strategy is not implemented")
192
+
193
+
194
@app.get(
    "/match",
    name="",
    description="Match input to identifiers.",
    tags=["Matching"],
    response_model=schemas.MatchedItemsResponse,
    response_class=AsciiJSONResponse,
)
async def match_get(
    task: str,
    input_data: Annotated[str, Query(alias="input")],
    strategy: Union[str, None] = None,
):
    """GET variant of matching: the input arrives in the ``input`` query parameter."""
    return match(task=task, input_data=input_data, strategy=strategy)
208
+
209
+
210
@app.post(
    "/match",
    name="",
    description="Match input to identifiers.",
    tags=["Matching"],
    response_model=schemas.MatchedItemsResponse,
    response_class=AsciiJSONResponse,
)
async def match_post(
    request: Request,
    task: str,
    strategy: Union[str, None] = None,
):
    """POST variant of matching: the input is the request body decoded as UTF-8."""
    raw_body = await request.body()
    return match(task, raw_body.decode("UTF-8"), strategy)
225
+
226
+
227
def run_app(host="0.0.0.0", port=8000, reload=True, log_level="debug"):
    """Start the matching API under uvicorn.

    Args:
        host: Interface to bind to. The default "0.0.0.0" listens on all
            interfaces.
        port: TCP port to listen on.
        reload: Passed to uvicorn's ``reload`` option. The default ``True``
            keeps the previous hard-coded behaviour; pass ``False`` to turn
            auto-reload off.
        log_level: Passed to uvicorn's ``log_level`` option. The default
            "debug" keeps the previous hard-coded behaviour.
    """
    uvicorn.run(
        # The app is given as an import string rather than an object.
        "crossref_matcher.app:app",
        host=host,
        port=port,
        reload=reload,
        log_level=log_level,
    )
235
+
236
+
237
if __name__ == "__main__":
    # Command-line entry point: parse --host/--port and start the server.
    import argparse

    cli = argparse.ArgumentParser(description="Run Crossref Matching API server")
    cli.add_argument(
        "--host", type=str, default="0.0.0.0", help="Host to run the server on"
    )
    cli.add_argument(
        "--port", type=int, default=8000, help="Port to run the server on"
    )
    options = cli.parse_args()
    run_app(host=options.host, port=options.port)
@@ -0,0 +1,13 @@
1
+ import warnings
2
+ from .api_routes import router as router
3
+
4
# Re-export the evaluation helpers.  If importing them fails, emit a warning
# instead of failing the import of this package.
try:
    from .evaluation import (
        EvaluateSklearn as EvaluateSklearn,
        get_matched_alternate as get_matched_alternate,
        run_strategy_on_eval_data as run_strategy_on_eval_data,
    )
except ImportError as exc:
    warnings.warn(
        f"Error when importing evaluation module. Some functionality may be unavailable: {exc}"
    )
@@ -0,0 +1,185 @@
1
+ from copy import deepcopy
2
+ from typing import Iterable
3
+
4
+ from crossref_matcher.evaluation.evaluation import get_matched_alternate
5
+
6
+
7
def res_cat(true_link, matched_link):
    """Classify one (true, matched) pair as ``"TP"``, ``"TN"``, ``"FP"`` or ``"FN"``.

    The string ``"no_match"`` is the sentinel for "no link" on either side.

    Args:
        true_link: Expected link, or ``"no_match"``.
        matched_link: Link produced by the strategy, or ``"no_match"``.

    Returns:
        ``"TN"`` when both are ``"no_match"``; ``"FN"`` when only the matched
        side is ``"no_match"``; ``"TP"`` when a real match equals the true
        link; ``"FP"`` for any other real match.
    """
    if matched_link == "no_match":
        return "TN" if true_link == "no_match" else "FN"
    return "TP" if matched_link == true_link else "FP"
18
+
19
+
20
+ def get_candidate_position(candidates, target) -> int | None:
21
+ if target == "no_match":
22
+ return None
23
+ if not candidates:
24
+ return None
25
+ for i, candidate in enumerate(candidates):
26
+ if candidate["id"] == target:
27
+ return i
28
+ return -1
29
+
30
+
31
def construct_true_relaxed_list(this_result_true, alternates_negative):
    """Lazily yield the true links with negative alternates blanked out.

    Args:
        this_result_true: Iterable of true links.
        alternates_negative: Container of links that should count as
            ``"no_match"``.

    Yields:
        ``"no_match"`` for every true link found in ``alternates_negative``,
        and the true link itself otherwise, in the original order.
    """
    for true_link in this_result_true:
        yield "no_match" if true_link in alternates_negative else true_link
37
+
38
+
39
def get_matched_name(item, matched_id):
    """Return the ``"matched_name"`` recorded for ``matched_id``, if any.

    Args:
        item: Mapping with an ``"extra"`` mapping; its optional
            ``"matching_result"`` entry is a sequence of mappings that each
            have an ``"id"`` key.
        matched_id: Identifier to look up.

    Returns:
        The ``"matched_name"`` value of the first matching-result entry whose
        ``"id"`` equals ``matched_id`` (``None`` if that entry has no such
        key), or ``None`` when no entry matches.
    """
    entries = item["extra"].get("matching_result", [])
    return next(
        (entry.get("matched_name") for entry in entries if entry["id"] == matched_id),
        None,
    )
44
+
45
+
46
def construct_matched_list(this_result_true, matched, alternates_negative=None):
    """Build a "matched" list aligned one-to-one with the true list.

    Each true link that also appears in ``matched`` keeps its own value at
    its own position. The remaining positions are filled with the unclaimed
    matched values, taken from the end of what is left, or with
    ``"no_match"`` once nothing is left. ``matched`` itself is not modified;
    the function works on a deep copy.

    Args:
        this_result_true: True links for one item.
        matched: Links returned by the strategy for the same item.
        alternates_negative: Optional container of links to be treated as
            ``"no_match"`` when they are used to fill an unclaimed position.

    Returns:
        A list with the same length as ``this_result_true``.
    """
    leftovers = deepcopy(matched)
    if len(leftovers) == 0:
        leftovers = ["no_match"]

    # First pass: claim exact hits; False marks positions still to be filled.
    aligned = []
    for true_link in this_result_true:
        try:
            position = leftovers.index(true_link)
        except ValueError:
            aligned.append(False)
        else:
            aligned.append(leftovers.pop(position))

    # Second pass: fill each False with a leftover match or "no_match".
    for slot, value in enumerate(aligned):
        if value is not False:
            continue
        if len(leftovers) == 0:
            aligned[slot] = "no_match"
            continue
        spare = leftovers.pop()
        if alternates_negative is not None and spare in alternates_negative:
            # This will end up changing a true positive into a true negative.
            # This is a bug, but it's a tough one to fix.
            aligned[slot] = "no_match"
        else:
            aligned[slot] = spare
    return aligned
72
+
73
+
74
def _claim_match(pool, wanted):
    """Remove and return ``wanted`` from ``pool``, or its first entry if absent."""
    chosen = wanted if wanted in pool else pool[0]
    pool.remove(chosen)
    return chosen


def get_all_links(items: Iterable[dict]) -> list[dict]:
    """
    Flatten evaluation items into one link record per true link.

    Every item contributes one record for each of its true links (or a single
    record for ``"no_match"`` when the item has no expected output). Each
    record pairs the expected link with the link the strategy matched, under
    both strict and relaxed criteria, and carries metadata for analysis. The
    records are suitable for building a pandas DataFrame.

    Matching modes:
    - Strict: true and matched links are compared directly.
    - Relaxed: alternate matches listed in the item's ``alternates`` are
      taken into account.

    Some output fields rely on metadata that is only present when an
    ``extra_fn`` function was used during evaluation; see the docstring of
    ``run_strategy_on_eval_data`` for how to provide one.

    Args:
        items: Iterable of evaluation data items. These can be obtained from a
            list of Result objects via ``r.model_dump() for r in results`` or
            from a ResultSet object via
            ``r.model_dump() for r in result_set.results``.

    Returns:
        List of link record dicts. Each record contains:
        - "unique_id": Zero-padded ``seq_no`` joined to the true link
        - "seq_no": Sequence number copied from the input item
        - "input": Input text copied from the item
        - "true_link": The expected link for this record
        - "matched": The link paired with it under strict matching
        - "matched_relaxed": The link paired with it under relaxed matching
        - "res_cat": Result category (TP/TN/FP/FN) for strict matching
        - "res_cat_relaxed": Result category for relaxed matching
        - "matched_name": Name recorded for the matched link, if any
        - "target_in_initial_candidates": Position of the true link among the
          initial candidates (``-1`` if absent, ``None`` if not applicable)
        - "target_es_score": ``elasticsearch_score`` of the true link's
          candidate entry, or ``None`` when it is not among the candidates
        - "weight": The item's ``weight`` value, if present

    Note:
        - When the numbers of true and matched links differ, the gaps are
          filled with "no_match" or with false-positive matches.

    Example usage:
        >>> # result_set is an instance of ResultSet obtained from evaluation.run_strategy_on_eval_data()
        >>> data = get_all_links(r.model_dump() for r in result_set.results)
        >>> import pandas as pd
        >>> df = pd.DataFrame(data).set_index("unique_id")
    """
    records = []
    for item in items:
        true_links = item["output"] if len(item["output"]) != 0 else ["no_match"]
        negatives = [
            alt[1] for alt in item.get("alternates", []) if alt[0] == "no_match"
        ]
        true_links_relaxed = list(construct_true_relaxed_list(true_links, negatives))

        # Both pools have one entry per true link; the loop below consumes
        # exactly one entry from each pool per record.
        strict_pool = construct_matched_list(true_links, item["matched"])
        relaxed_pool = construct_matched_list(
            true_links_relaxed,
            get_matched_alternate(item),
            alternates_negative=negatives,
        )

        for this_row_true, this_row_true_relaxed in zip(true_links, true_links_relaxed):
            this_row_matched = _claim_match(strict_pool, this_row_true)
            this_row_matched_relaxed = _claim_match(relaxed_pool, this_row_true_relaxed)

            candidates = item["extra"].get("initial_candidates", None)
            position = get_candidate_position(candidates, this_row_true)
            if position is None or position < 0:
                es_score = None
            else:
                es_score = candidates[position]["elasticsearch_score"]

            record = {
                "unique_id": f"""{item["seq_no"]:06}_{this_row_true}""",
                "seq_no": item["seq_no"],
                "input": item["input"],
                "true_link": this_row_true,
                "matched": this_row_matched,
                "matched_relaxed": this_row_matched_relaxed,
                "res_cat": res_cat(this_row_true, this_row_matched),
                "res_cat_relaxed": res_cat(
                    this_row_true_relaxed, this_row_matched_relaxed
                ),
                "matched_name": get_matched_name(item, this_row_matched),
                "target_in_initial_candidates": position,
                "target_es_score": es_score,
                "weight": item.get("weight"),
            }
            records.append(record)
    return records
@@ -0,0 +1,20 @@
1
+ import json
2
+ from starlette.responses import JSONResponse
3
+ from typing import Any
4
+ from fastapi import APIRouter
5
+
6
+
7
class AsciiJSONResponse(JSONResponse):
    """JSON response whose body is serialized with ``ensure_ascii=True``."""

    def render(self, content: Any) -> bytes:
        """Serialize ``content`` to JSON text and return it as UTF-8 bytes."""
        serialized = json.dumps(content, ensure_ascii=True)
        return serialized.encode("utf-8")
10
+
11
+
12
# Router for the evaluation endpoints: all routes live under "/evaluate" and
# carry the "Evaluation" tag.
router = APIRouter(prefix="/evaluate", tags=["Evaluation"])
16
+
17
+
18
# No route-level ``tags`` here: the router already applies the "Evaluation"
# tag, and FastAPI concatenates router-level and route-level tags, so
# repeating it tags the operation twice.
@router.get("/", response_class=AsciiJSONResponse)
async def evaluation_coming_soon():
    # Placeholder endpoint: returns a fixed message.
    return {"message": "Evaluation endpoints coming soon!"}