followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +132 -55
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.6.dist-info/METADATA +0 -39
- followthemoney-1.3.6.dist-info/RECORD +0 -108
- followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.6.dist-info/metadata.json +0 -1
- followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.6.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -27
- tests/types/test_countries.py +0 -21
- tests/types/test_dates.py +0 -72
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -30
- tests/types/test_entity.py +0 -16
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -25
- tests/types/test_ip.py +0 -26
- tests/types/test_languages.py +0 -20
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/types/__init__.py
CHANGED
|
@@ -1,30 +1,48 @@
|
|
|
1
|
+
from followthemoney.types.registry import Registry
|
|
1
2
|
from followthemoney.types.url import UrlType
|
|
2
3
|
from followthemoney.types.name import NameType
|
|
3
|
-
from followthemoney.types.domain import DomainType
|
|
4
4
|
from followthemoney.types.email import EmailType
|
|
5
5
|
from followthemoney.types.ip import IpType
|
|
6
|
-
from followthemoney.types.iban import IbanType
|
|
7
6
|
from followthemoney.types.address import AddressType
|
|
8
7
|
from followthemoney.types.date import DateType
|
|
9
8
|
from followthemoney.types.phone import PhoneType
|
|
10
9
|
from followthemoney.types.country import CountryType
|
|
11
10
|
from followthemoney.types.language import LanguageType
|
|
11
|
+
from followthemoney.types.mimetype import MimeType
|
|
12
|
+
from followthemoney.types.checksum import ChecksumType
|
|
12
13
|
from followthemoney.types.identifier import IdentifierType
|
|
14
|
+
from followthemoney.types.iban import IbanType
|
|
13
15
|
from followthemoney.types.entity import EntityType
|
|
14
|
-
from followthemoney.types.
|
|
16
|
+
from followthemoney.types.topic import TopicType
|
|
17
|
+
from followthemoney.types.gender import GenderType
|
|
18
|
+
from followthemoney.types.json import JsonType
|
|
19
|
+
from followthemoney.types.string import TextType
|
|
20
|
+
from followthemoney.types.string import HTMLType
|
|
21
|
+
from followthemoney.types.string import StringType
|
|
22
|
+
from followthemoney.types.number import NumberType
|
|
23
|
+
from followthemoney.types.common import PropertyType
|
|
15
24
|
|
|
16
25
|
registry = Registry()
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
26
|
+
registry.add(UrlType)
|
|
27
|
+
registry.add(NameType)
|
|
28
|
+
registry.add(EmailType)
|
|
29
|
+
registry.add(IpType)
|
|
30
|
+
registry.add(AddressType)
|
|
31
|
+
registry.add(DateType)
|
|
32
|
+
registry.add(PhoneType)
|
|
33
|
+
registry.add(CountryType)
|
|
34
|
+
registry.add(LanguageType)
|
|
35
|
+
registry.add(MimeType)
|
|
36
|
+
registry.add(ChecksumType)
|
|
37
|
+
registry.add(IdentifierType)
|
|
38
|
+
registry.add(IbanType) # TODO: remove
|
|
39
|
+
registry.add(EntityType)
|
|
40
|
+
registry.add(TopicType)
|
|
41
|
+
registry.add(GenderType)
|
|
42
|
+
registry.add(JsonType)
|
|
43
|
+
registry.add(TextType)
|
|
44
|
+
registry.add(HTMLType)
|
|
45
|
+
registry.add(StringType)
|
|
46
|
+
registry.add(NumberType)
|
|
47
|
+
|
|
48
|
+
__all__ = ["PropertyType", "registry"]
|
followthemoney/types/address.py
CHANGED
|
@@ -1,31 +1,51 @@
|
|
|
1
1
|
import re
|
|
2
|
+
from typing import Optional, TYPE_CHECKING
|
|
3
|
+
from normality import slugify
|
|
2
4
|
from normality.cleaning import collapse_spaces
|
|
3
5
|
|
|
4
6
|
from followthemoney.types.common import PropertyType
|
|
7
|
+
from followthemoney.util import defer as _
|
|
5
8
|
from followthemoney.util import dampen
|
|
6
9
|
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from followthemoney.proxy import EntityProxy
|
|
12
|
+
|
|
7
13
|
|
|
8
14
|
class AddressType(PropertyType):
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
15
|
+
"""A geographic address used to describe a location of a residence or post
|
|
16
|
+
box. There is no specified order for the sub-parts of an address (e.g. street,
|
|
17
|
+
city, postal code), and we should consider introducing an Address schema type
|
|
18
|
+
to retain fidelity in cases where address parts are specified."""
|
|
19
|
+
|
|
20
|
+
LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
|
|
21
|
+
COMMATA = re.compile(r"(,\s?[,\.])")
|
|
22
|
+
name = "address"
|
|
23
|
+
group = "addresses"
|
|
24
|
+
label = _("Address")
|
|
25
|
+
plural = _("Addresses")
|
|
26
|
+
matchable = True
|
|
27
|
+
pivot = True
|
|
14
28
|
|
|
15
|
-
def clean_text(
|
|
29
|
+
def clean_text(
|
|
30
|
+
self,
|
|
31
|
+
text: str,
|
|
32
|
+
fuzzy: bool = False,
|
|
33
|
+
format: Optional[str] = None,
|
|
34
|
+
proxy: Optional["EntityProxy"] = None,
|
|
35
|
+
) -> Optional[str]:
|
|
16
36
|
"""Basic clean-up."""
|
|
17
|
-
address = self.LINE_BREAKS.sub(
|
|
18
|
-
address = self.COMMATA.sub(
|
|
19
|
-
|
|
20
|
-
if len(
|
|
21
|
-
return
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
return
|
|
37
|
+
address = self.LINE_BREAKS.sub(", ", text)
|
|
38
|
+
address = self.COMMATA.sub(", ", address)
|
|
39
|
+
collapsed = collapse_spaces(address)
|
|
40
|
+
if collapsed is None or not len(collapsed):
|
|
41
|
+
return None
|
|
42
|
+
return collapsed
|
|
43
|
+
|
|
44
|
+
def _specificity(self, value: str) -> float:
|
|
45
|
+
return dampen(10, 60, value)
|
|
46
|
+
|
|
47
|
+
def node_id(self, value: str) -> Optional[str]:
|
|
48
|
+
slug = slugify(value)
|
|
49
|
+
if slug is None:
|
|
50
|
+
return None
|
|
51
|
+
return f"addr:{value}"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
2
|
+
from followthemoney.types.common import PropertyType
|
|
3
|
+
from followthemoney.util import defer as _
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ChecksumType(PropertyType):
|
|
7
|
+
"""Content hashes calculated using SHA1. Checksum references are used by
|
|
8
|
+
document-typed entities in Aleph to refer to raw data in the archive
|
|
9
|
+
(e.g. the document from which the entity is extracted).
|
|
10
|
+
|
|
11
|
+
Unfortunately, this has some security implications: in order to avoid people
|
|
12
|
+
getting access to documents for which they know the checksum, properties
|
|
13
|
+
of this type are scrubbed when submitted via the normal API. Checksums can only
|
|
14
|
+
be defined by uploading a document to be ingested."""
|
|
15
|
+
|
|
16
|
+
name = "checksum"
|
|
17
|
+
group = "checksums"
|
|
18
|
+
label = _("Checksum")
|
|
19
|
+
plural = _("Checksums")
|
|
20
|
+
matchable = True
|
|
21
|
+
pivot = True
|
|
22
|
+
max_length = 40
|
|
23
|
+
|
|
24
|
+
def rdf(self, value: str) -> Identifier:
|
|
25
|
+
return URIRef(f"hash:{value}")
|
followthemoney/types/common.py
CHANGED
|
@@ -1,131 +1,276 @@
|
|
|
1
|
+
from inspect import cleandoc
|
|
1
2
|
from itertools import product
|
|
2
|
-
from
|
|
3
|
-
from banal import ensure_list
|
|
3
|
+
from babel.core import Locale
|
|
4
|
+
from banal import ensure_list
|
|
4
5
|
from normality import stringify
|
|
6
|
+
from typing import Any, Dict, Optional, Sequence, Callable, TYPE_CHECKING, TypedDict
|
|
7
|
+
|
|
8
|
+
from followthemoney.rdf import Literal, Identifier
|
|
9
|
+
from followthemoney.util import get_locale
|
|
10
|
+
from followthemoney.util import gettext, sanitize_text
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from followthemoney.proxy import EntityProxy
|
|
14
|
+
|
|
15
|
+
EnumValues = Dict[str, str]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class PropertyTypeToDict(TypedDict, total=False):
|
|
19
|
+
label: str
|
|
20
|
+
plural: str
|
|
21
|
+
description: Optional[str]
|
|
22
|
+
maxLength: int
|
|
23
|
+
group: Optional[str]
|
|
24
|
+
matchable: Optional[bool]
|
|
25
|
+
pivot: Optional[bool]
|
|
26
|
+
values: Optional[EnumValues]
|
|
5
27
|
|
|
6
28
|
|
|
7
29
|
class PropertyType(object):
|
|
8
|
-
"""Base class for all types."""
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
30
|
+
"""Base class for all property types."""
|
|
31
|
+
|
|
32
|
+
name: str = "any"
|
|
33
|
+
"""A machine-facing, variable safe name for the given type."""
|
|
34
|
+
|
|
35
|
+
group: Optional[str] = None
|
|
36
|
+
"""Groups are used to invert all the properties of an entity that have a
|
|
37
|
+
given type into a single list before indexing them. This way, in Aleph,
|
|
38
|
+
you can query for ``countries:gb`` instead of having to make a set of filters
|
|
39
|
+
like ``properties.jurisdiction:gb OR properties.country:gb OR ...``."""
|
|
40
|
+
|
|
41
|
+
label: str = "Any"
|
|
42
|
+
"""A name for this type to be shown to users."""
|
|
43
|
+
|
|
44
|
+
plural: str = "Any"
|
|
45
|
+
"""A plural name for this type which can be used in appropriate places in
|
|
46
|
+
a user interface."""
|
|
47
|
+
|
|
48
|
+
matchable: bool = True
|
|
49
|
+
"""Matchable types allow properties to be compared with each other in order to
|
|
50
|
+
assess entity similarity. While it makes sense to compare names, countries or
|
|
51
|
+
phone numbers, the same isn't true for raw JSON blobs or descriptive text
|
|
52
|
+
snippets."""
|
|
53
|
+
|
|
54
|
+
pivot: bool = False
|
|
55
|
+
"""Pivot property types are like a stronger form of :attr:`~matchable` types:
|
|
56
|
+
they will be used when value-based lookups are used to find commonalities
|
|
57
|
+
between entities. For example, pivot typed-properties are used to show all the
|
|
58
|
+
other entities that mention the same phone number, email address or name as the
|
|
59
|
+
one currently seen by the user."""
|
|
60
|
+
|
|
61
|
+
max_length: int = 250
|
|
62
|
+
"""The maximum length of a single value of this type. This is used to warn when
|
|
63
|
+
adding individual values that may be malformed or too long to be stored in
|
|
64
|
+
downstream databases with fixed column lengths. The unit is unicode codepoints
|
|
65
|
+
(not bytes), the output of Python len()."""
|
|
66
|
+
|
|
67
|
+
total_size: Optional[int] = None
|
|
68
|
+
"""Some types have overall size limitations in place in order to avoid generating
|
|
69
|
+
entities that are very large (upstream ElasticSearch has a 100MB document limit).
|
|
70
|
+
Once the total size of all properties of this type has exceed the given limit,
|
|
71
|
+
an entity will refuse to add further values."""
|
|
72
|
+
|
|
73
|
+
@property
|
|
74
|
+
def docs(self) -> Optional[str]:
|
|
75
|
+
if not self.__doc__:
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
return cleandoc(self.__doc__)
|
|
79
|
+
|
|
80
|
+
def validate(
|
|
81
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
82
|
+
) -> bool:
|
|
83
|
+
"""Returns a boolean to indicate if the given value is a valid instance of
|
|
16
84
|
the type."""
|
|
17
|
-
cleaned = self.clean(
|
|
85
|
+
cleaned = self.clean(value, fuzzy=fuzzy, format=format)
|
|
18
86
|
return cleaned is not None
|
|
19
87
|
|
|
20
|
-
def clean(
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
88
|
+
def clean(
|
|
89
|
+
self,
|
|
90
|
+
raw: Any,
|
|
91
|
+
fuzzy: bool = False,
|
|
92
|
+
format: Optional[str] = None,
|
|
93
|
+
proxy: Optional["EntityProxy"] = None,
|
|
94
|
+
) -> Optional[str]:
|
|
95
|
+
"""Create a clean version of a value of the type, suitable for storage
|
|
96
|
+
in an entity proxy."""
|
|
97
|
+
text = sanitize_text(raw)
|
|
98
|
+
if text is None:
|
|
99
|
+
return None
|
|
100
|
+
return self.clean_text(text, fuzzy=fuzzy, format=format, proxy=proxy)
|
|
26
101
|
|
|
27
|
-
def clean_text(
|
|
102
|
+
def clean_text(
|
|
103
|
+
self,
|
|
104
|
+
text: str,
|
|
105
|
+
fuzzy: bool = False,
|
|
106
|
+
format: Optional[str] = None,
|
|
107
|
+
proxy: Optional["EntityProxy"] = None,
|
|
108
|
+
) -> Optional[str]:
|
|
109
|
+
"""Specific types can apply their own cleaning routines here (this is called
|
|
110
|
+
by ``clean`` after the value has been converted to a string and null values
|
|
111
|
+
have been filtered)."""
|
|
28
112
|
return text
|
|
29
113
|
|
|
30
|
-
def
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
return
|
|
36
|
-
|
|
37
|
-
def
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
for
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
114
|
+
def join(self, values: Sequence[str]) -> str:
|
|
115
|
+
"""Helper function for converting multi-valued FtM data into formats that
|
|
116
|
+
allow only a single value per field (e.g. CSV). This is not fully reversible
|
|
117
|
+
and should be used as a last option."""
|
|
118
|
+
values = ensure_list(values)
|
|
119
|
+
return "; ".join(values)
|
|
120
|
+
|
|
121
|
+
def _specificity(self, value: str) -> float:
|
|
122
|
+
return 1.0
|
|
123
|
+
|
|
124
|
+
def specificity(self, value: Optional[str]) -> float:
|
|
125
|
+
"""Return a score for how specific the given value is. This can be used as a
|
|
126
|
+
weighting factor in entity comparisons in order to rate matching property
|
|
127
|
+
values by how specific they are. For example: a longer address is considered
|
|
128
|
+
to be more specific than a short one, a full date more specific than just a
|
|
129
|
+
year number, etc."""
|
|
130
|
+
if not self.matchable or value is None:
|
|
131
|
+
return 0.0
|
|
132
|
+
return self._specificity(value)
|
|
133
|
+
|
|
134
|
+
def compare_safe(self, left: Optional[str], right: Optional[str]) -> float:
|
|
135
|
+
"""Compare, but support None values on either side of the comparison."""
|
|
49
136
|
left = stringify(left)
|
|
50
137
|
right = stringify(right)
|
|
51
138
|
if left is None or right is None:
|
|
52
|
-
return 0
|
|
139
|
+
return 0.0
|
|
53
140
|
return self.compare(left, right)
|
|
54
141
|
|
|
55
|
-
def compare(self, left, right):
|
|
142
|
+
def compare(self, left: str, right: str) -> float:
|
|
56
143
|
"""Comparisons are a float between 0 and 1. They can assume
|
|
57
144
|
that the given data is cleaned, but not normalised."""
|
|
58
145
|
if left.lower() == right.lower():
|
|
59
|
-
return 1 * self.specificity(left)
|
|
60
|
-
return 0
|
|
146
|
+
return 1.0 * self.specificity(left)
|
|
147
|
+
return 0.0
|
|
61
148
|
|
|
62
|
-
def compare_sets(
|
|
63
|
-
|
|
149
|
+
def compare_sets(
|
|
150
|
+
self,
|
|
151
|
+
left: Sequence[str],
|
|
152
|
+
right: Sequence[str],
|
|
153
|
+
func: Callable[[Sequence[float]], float] = max,
|
|
154
|
+
) -> float:
|
|
155
|
+
"""Compare two sets of values and select the highest-scored result."""
|
|
64
156
|
results = []
|
|
65
|
-
for
|
|
66
|
-
results.append(self.
|
|
157
|
+
for le, ri in product(ensure_list(left), ensure_list(right)):
|
|
158
|
+
results.append(self.compare(le, ri))
|
|
67
159
|
if not len(results):
|
|
68
|
-
return 0
|
|
160
|
+
return 0.0
|
|
69
161
|
return func(results)
|
|
70
162
|
|
|
71
|
-
def country_hint(self, value):
|
|
72
|
-
"""Determine if the given value allows us to infer a country
|
|
73
|
-
|
|
163
|
+
def country_hint(self, value: str) -> Optional[str]:
|
|
164
|
+
"""Determine if the given value allows us to infer a country that it may
|
|
165
|
+
be related to (e.g. using a country prefix on a phone number or IBAN)."""
|
|
74
166
|
return None
|
|
75
167
|
|
|
76
|
-
def
|
|
77
|
-
"""
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
value
|
|
168
|
+
def rdf(self, value: str) -> Identifier:
|
|
169
|
+
"""Return an RDF term to represent the given value - either a string
|
|
170
|
+
literal, or a URI reference."""
|
|
171
|
+
return Literal(value)
|
|
172
|
+
|
|
173
|
+
def pick(self, values: Sequence[str]) -> Optional[str]:
|
|
174
|
+
"""Pick the best value to show to the user."""
|
|
175
|
+
raise NotImplementedError
|
|
176
|
+
|
|
177
|
+
def node_id(self, value: str) -> Optional[str]:
|
|
178
|
+
"""Return an ID suitable to identify this entity as a typed node in a
|
|
179
|
+
graph representation of some FtM data. It's usually the same as the the
|
|
180
|
+
RDF form."""
|
|
181
|
+
return str(self.rdf(value))
|
|
182
|
+
|
|
183
|
+
def node_id_safe(self, value: Optional[str]) -> Optional[str]:
|
|
184
|
+
"""Wrapper for node_id to handle None values."""
|
|
83
185
|
if value is None:
|
|
84
|
-
return
|
|
85
|
-
return
|
|
186
|
+
return None
|
|
187
|
+
return self.node_id(value)
|
|
86
188
|
|
|
87
|
-
def
|
|
88
|
-
|
|
189
|
+
def caption(self, value: str) -> Optional[str]:
|
|
190
|
+
"""Return a label for the given property value. This is often the same as the
|
|
191
|
+
value, but for types like countries or languages, it would return the label,
|
|
192
|
+
while other values like phone numbers can be formatted to be nicer to read."""
|
|
193
|
+
return value
|
|
89
194
|
|
|
90
|
-
def
|
|
195
|
+
def to_dict(self) -> PropertyTypeToDict:
|
|
196
|
+
"""Return a serialisable description of this data type."""
|
|
197
|
+
data: PropertyTypeToDict = {
|
|
198
|
+
"label": gettext(self.label),
|
|
199
|
+
"plural": gettext(self.plural),
|
|
200
|
+
"description": gettext(self.docs),
|
|
201
|
+
"maxLength": self.max_length,
|
|
202
|
+
}
|
|
203
|
+
if self.group:
|
|
204
|
+
data["group"] = self.group
|
|
205
|
+
if self.matchable:
|
|
206
|
+
data["matchable"] = True
|
|
207
|
+
if self.pivot:
|
|
208
|
+
data["pivot"] = True
|
|
209
|
+
return data
|
|
210
|
+
|
|
211
|
+
def __eq__(self, other: Any) -> bool:
|
|
212
|
+
if not isinstance(other, PropertyType):
|
|
213
|
+
return False
|
|
91
214
|
return self.name == other.name
|
|
92
215
|
|
|
93
|
-
def __hash__(self):
|
|
216
|
+
def __hash__(self) -> int:
|
|
94
217
|
return hash(self.name)
|
|
95
218
|
|
|
96
|
-
def __str__(self):
|
|
219
|
+
def __str__(self) -> str:
|
|
97
220
|
return self.name
|
|
98
221
|
|
|
99
|
-
def __repr__(self):
|
|
100
|
-
return
|
|
222
|
+
def __repr__(self) -> str:
|
|
223
|
+
return f"<{self.name}>"
|
|
224
|
+
|
|
101
225
|
|
|
226
|
+
class EnumType(PropertyType):
|
|
227
|
+
"""Enumerated type properties are used for types which have a defined set
|
|
228
|
+
of possible values, like languages and countries."""
|
|
102
229
|
|
|
103
|
-
|
|
104
|
-
|
|
230
|
+
def __init__(self) -> None:
|
|
231
|
+
self._names: Dict[Locale, EnumValues] = {}
|
|
232
|
+
self.codes = set(self.names.keys())
|
|
105
233
|
|
|
234
|
+
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
235
|
+
return {}
|
|
106
236
|
|
|
107
|
-
|
|
237
|
+
@property
|
|
238
|
+
def names(self) -> EnumValues:
|
|
239
|
+
"""Return a mapping from property values to their labels in the current
|
|
240
|
+
locale."""
|
|
241
|
+
locale = get_locale()
|
|
242
|
+
if locale not in self._names:
|
|
243
|
+
self._names[locale] = self._locale_names(locale)
|
|
244
|
+
return self._names[locale]
|
|
108
245
|
|
|
109
|
-
def
|
|
110
|
-
self
|
|
111
|
-
|
|
112
|
-
|
|
246
|
+
def validate(
|
|
247
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
248
|
+
) -> bool:
|
|
249
|
+
"""Make sure that the given code value is one of the supported set."""
|
|
250
|
+
if value is None:
|
|
251
|
+
return False
|
|
252
|
+
return str(value).lower().strip() in self.codes
|
|
113
253
|
|
|
114
|
-
def
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
254
|
+
def clean_text(
|
|
255
|
+
self,
|
|
256
|
+
code: str,
|
|
257
|
+
fuzzy: bool = False,
|
|
258
|
+
format: Optional[str] = None,
|
|
259
|
+
proxy: Optional["EntityProxy"] = None,
|
|
260
|
+
) -> Optional[str]:
|
|
261
|
+
"""All code values are cleaned to be lowercase and trailing whitespace is
|
|
262
|
+
removed."""
|
|
263
|
+
code = code.lower().strip()
|
|
264
|
+
if code not in self.codes:
|
|
265
|
+
return None
|
|
266
|
+
return code
|
|
122
267
|
|
|
123
|
-
def
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
except AttributeError:
|
|
127
|
-
pass
|
|
268
|
+
def caption(self, value: str) -> str:
|
|
269
|
+
"""Given a code value, return the label that should be shown to a user."""
|
|
270
|
+
return self.names.get(value, value)
|
|
128
271
|
|
|
129
|
-
def
|
|
130
|
-
|
|
131
|
-
|
|
272
|
+
def to_dict(self) -> PropertyTypeToDict:
|
|
273
|
+
"""When serialising the model to JSON, include all values."""
|
|
274
|
+
data = super(EnumType, self).to_dict()
|
|
275
|
+
data["values"] = self.names
|
|
276
|
+
return data
|