followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- followthemoney/__init__.py +5 -3
- followthemoney/cli/__init__.py +17 -0
- followthemoney/cli/aggregate.py +56 -0
- followthemoney/cli/cli.py +88 -0
- followthemoney/cli/exports.py +121 -0
- followthemoney/cli/mapping.py +85 -0
- followthemoney/cli/sieve.py +67 -0
- followthemoney/cli/util.py +142 -0
- followthemoney/compare.py +132 -55
- followthemoney/exc.py +19 -6
- followthemoney/export/common.py +29 -0
- followthemoney/export/csv.py +82 -0
- followthemoney/export/excel.py +75 -0
- followthemoney/export/graph.py +79 -0
- followthemoney/export/neo4j.py +182 -0
- followthemoney/export/rdf.py +26 -0
- followthemoney/graph.py +308 -0
- followthemoney/helpers.py +212 -0
- followthemoney/mapping/__init__.py +1 -1
- followthemoney/mapping/csv.py +67 -35
- followthemoney/mapping/entity.py +116 -44
- followthemoney/mapping/property.py +90 -44
- followthemoney/mapping/query.py +27 -19
- followthemoney/mapping/source.py +15 -5
- followthemoney/mapping/sql.py +75 -61
- followthemoney/messages.py +13 -7
- followthemoney/model.py +108 -56
- followthemoney/namespace.py +119 -0
- followthemoney/offshore.py +48 -0
- followthemoney/ontology.py +77 -0
- followthemoney/property.py +204 -71
- followthemoney/proxy.py +455 -118
- followthemoney/rdf.py +9 -0
- followthemoney/schema/Address.yaml +78 -0
- followthemoney/schema/Airplane.yaml +17 -10
- followthemoney/schema/Analyzable.yaml +54 -0
- followthemoney/schema/Article.yaml +16 -0
- followthemoney/schema/Assessment.yaml +32 -0
- followthemoney/schema/Asset.yaml +10 -4
- followthemoney/schema/Associate.yaml +41 -0
- followthemoney/schema/Audio.yaml +24 -0
- followthemoney/schema/BankAccount.yaml +53 -9
- followthemoney/schema/Call.yaml +48 -0
- followthemoney/schema/CallForTenders.yaml +117 -0
- followthemoney/schema/Company.yaml +37 -12
- followthemoney/schema/Contract.yaml +41 -7
- followthemoney/schema/ContractAward.yaml +30 -11
- followthemoney/schema/CourtCase.yaml +16 -10
- followthemoney/schema/CourtCaseParty.yaml +17 -6
- followthemoney/schema/CryptoWallet.yaml +48 -0
- followthemoney/schema/Debt.yaml +37 -0
- followthemoney/schema/Directorship.yaml +17 -4
- followthemoney/schema/Document.yaml +72 -139
- followthemoney/schema/Documentation.yml +38 -0
- followthemoney/schema/EconomicActivity.yaml +32 -17
- followthemoney/schema/Email.yaml +76 -0
- followthemoney/schema/Employment.yaml +39 -0
- followthemoney/schema/Event.yaml +35 -3
- followthemoney/schema/Family.yaml +41 -0
- followthemoney/schema/Folder.yaml +13 -0
- followthemoney/schema/HyperText.yaml +21 -0
- followthemoney/schema/Identification.yaml +40 -0
- followthemoney/schema/Image.yaml +25 -0
- followthemoney/schema/Interest.yaml +3 -6
- followthemoney/schema/Interval.yaml +56 -5
- followthemoney/schema/LegalEntity.yaml +81 -20
- followthemoney/schema/License.yaml +7 -3
- followthemoney/schema/Membership.yaml +19 -4
- followthemoney/schema/Mention.yaml +54 -0
- followthemoney/schema/Message.yaml +73 -0
- followthemoney/schema/Note.yaml +23 -0
- followthemoney/schema/Occupancy.yaml +40 -0
- followthemoney/schema/Organization.yaml +38 -3
- followthemoney/schema/Ownership.yaml +16 -4
- followthemoney/schema/Package.yaml +17 -0
- followthemoney/schema/Page.yaml +43 -0
- followthemoney/schema/Pages.yaml +23 -0
- followthemoney/schema/Passport.yaml +15 -17
- followthemoney/schema/Payment.yaml +38 -7
- followthemoney/schema/Person.yaml +61 -5
- followthemoney/schema/PlainText.yaml +17 -0
- followthemoney/schema/Position.yaml +50 -0
- followthemoney/schema/Post.yaml +42 -0
- followthemoney/schema/Project.yaml +27 -0
- followthemoney/schema/ProjectParticipant.yaml +36 -0
- followthemoney/schema/PublicBody.yaml +14 -3
- followthemoney/schema/RealEstate.yaml +19 -3
- followthemoney/schema/Representation.yaml +17 -6
- followthemoney/schema/Sanction.yaml +44 -20
- followthemoney/schema/Security.yaml +59 -0
- followthemoney/schema/Similar.yaml +37 -0
- followthemoney/schema/Succession.yaml +36 -0
- followthemoney/schema/Table.yaml +32 -0
- followthemoney/schema/TaxRoll.yaml +27 -9
- followthemoney/schema/Thing.yaml +69 -13
- followthemoney/schema/Trip.yaml +42 -0
- followthemoney/schema/UnknownLink.yaml +17 -6
- followthemoney/schema/UserAccount.yaml +44 -0
- followthemoney/schema/Value.yaml +5 -1
- followthemoney/schema/Vehicle.yaml +25 -8
- followthemoney/schema/Vessel.yaml +18 -10
- followthemoney/schema/Video.yaml +20 -0
- followthemoney/schema/Workbook.yaml +18 -0
- followthemoney/schema.py +406 -135
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
- followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
- followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
- followthemoney/translations/fr/followthemoney.po +3861 -0
- followthemoney/translations/messages.pot +3021 -725
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
- followthemoney/translations/ru/followthemoney.po +4221 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
- followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
- followthemoney/types/__init__.py +35 -17
- followthemoney/types/address.py +41 -21
- followthemoney/types/checksum.py +25 -0
- followthemoney/types/common.py +233 -88
- followthemoney/types/country.py +89 -56
- followthemoney/types/date.py +59 -76
- followthemoney/types/email.py +66 -35
- followthemoney/types/entity.py +66 -13
- followthemoney/types/gender.py +66 -0
- followthemoney/types/iban.py +47 -28
- followthemoney/types/identifier.py +49 -22
- followthemoney/types/ip.py +35 -21
- followthemoney/types/json.py +58 -0
- followthemoney/types/language.py +124 -37
- followthemoney/types/mimetype.py +44 -0
- followthemoney/types/name.py +56 -12
- followthemoney/types/number.py +30 -0
- followthemoney/types/phone.py +92 -34
- followthemoney/types/registry.py +52 -0
- followthemoney/types/string.py +43 -0
- followthemoney/types/topic.py +94 -0
- followthemoney/types/url.py +39 -17
- followthemoney/util.py +139 -45
- followthemoney-3.8.0.dist-info/METADATA +153 -0
- followthemoney-3.8.0.dist-info/RECORD +157 -0
- {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
- followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
- followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
- followthemoney/link.py +0 -75
- followthemoney/schema/Associate.yml +0 -19
- followthemoney/schema/Family.yml +0 -19
- followthemoney/schema/Land.yml +0 -9
- followthemoney/schema/Relationship.yaml +0 -26
- followthemoney/types/domain.py +0 -50
- followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
- followthemoney-1.3.6.dist-info/METADATA +0 -39
- followthemoney-1.3.6.dist-info/RECORD +0 -108
- followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
- followthemoney-1.3.6.dist-info/metadata.json +0 -1
- followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
- followthemoney-1.3.6.dist-info/top_level.txt +0 -3
- ns/ontology.py +0 -128
- tests/types/test_addresses.py +0 -24
- tests/types/test_common.py +0 -27
- tests/types/test_countries.py +0 -21
- tests/types/test_dates.py +0 -72
- tests/types/test_domains.py +0 -23
- tests/types/test_emails.py +0 -30
- tests/types/test_entity.py +0 -16
- tests/types/test_iban.py +0 -109
- tests/types/test_identifiers.py +0 -25
- tests/types/test_ip.py +0 -26
- tests/types/test_languages.py +0 -20
- tests/types/test_names.py +0 -33
- tests/types/test_phones.py +0 -24
- tests/types/test_registry.py +0 -14
- tests/types/test_urls.py +0 -23
- {ns → followthemoney/export}/__init__.py +0 -0
- /tests/types/__init__.py → /followthemoney/py.typed +0 -0
followthemoney/types/iban.py
CHANGED
|
@@ -1,39 +1,58 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
from schwifty import IBAN
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from rigour.ids import IBAN
|
|
4
3
|
|
|
5
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import sanitize_text, defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
6
10
|
|
|
7
11
|
|
|
8
12
|
class IbanType(PropertyType):
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
13
|
+
"""An international bank account number, as defined in ISO 13616. IBANs are
|
|
14
|
+
managed by SWIFT and used in the European SEPA payment system.
|
|
15
|
+
|
|
16
|
+
A notable aspect of IBANs is that they share a country prefix and validation
|
|
17
|
+
mechanism, but the specific length of an IBAN is dependent on the country
|
|
18
|
+
code defined in the first two characters: `NO8330001234567` and
|
|
19
|
+
`CY21002001950000357001234567` are both valid values."""
|
|
20
|
+
|
|
21
|
+
name = "iban"
|
|
22
|
+
group = "ibans"
|
|
23
|
+
label = _("IBAN")
|
|
24
|
+
plural = _("IBANs")
|
|
25
|
+
matchable = True
|
|
26
|
+
pivot = True
|
|
27
|
+
max_length = 64
|
|
28
|
+
|
|
29
|
+
def validate(
|
|
30
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
31
|
+
) -> bool:
|
|
32
|
+
text = sanitize_text(value)
|
|
33
|
+
if text is None:
|
|
23
34
|
return False
|
|
24
|
-
|
|
25
|
-
|
|
35
|
+
return IBAN.is_valid(text)
|
|
36
|
+
|
|
37
|
+
def clean_text(
|
|
38
|
+
self,
|
|
39
|
+
text: str,
|
|
40
|
+
fuzzy: bool = False,
|
|
41
|
+
format: Optional[str] = None,
|
|
42
|
+
proxy: Optional["EntityProxy"] = None,
|
|
43
|
+
) -> Optional[str]:
|
|
26
44
|
"""Create a more clean, but still user-facing version of an
|
|
27
45
|
instance of the type."""
|
|
28
|
-
return
|
|
46
|
+
return IBAN.normalize(text)
|
|
47
|
+
|
|
48
|
+
def country_hint(self, value: str) -> str:
|
|
49
|
+
return value[:2].lower()
|
|
29
50
|
|
|
30
|
-
def
|
|
31
|
-
return
|
|
51
|
+
def rdf(self, value: str) -> Identifier:
|
|
52
|
+
return URIRef(self.node_id(value))
|
|
32
53
|
|
|
33
|
-
def
|
|
34
|
-
value
|
|
35
|
-
if value is not None:
|
|
36
|
-
return value[:2].lower()
|
|
54
|
+
def node_id(self, value: str) -> str:
|
|
55
|
+
return f"iban:{value.upper()}"
|
|
37
56
|
|
|
38
|
-
def
|
|
39
|
-
return
|
|
57
|
+
def caption(self, value: str) -> str:
|
|
58
|
+
return IBAN.format(value)
|
|
@@ -1,34 +1,61 @@
|
|
|
1
1
|
import re
|
|
2
|
-
from
|
|
2
|
+
from typing import Optional, TYPE_CHECKING
|
|
3
|
+
from rigour.ids import get_identifier_format_names, get_identifier_format
|
|
3
4
|
|
|
4
5
|
from followthemoney.types.common import PropertyType
|
|
6
|
+
from followthemoney.util import dampen, shortest, longest
|
|
7
|
+
from followthemoney.util import defer as _
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
5
11
|
|
|
6
12
|
|
|
7
13
|
class IdentifierType(PropertyType):
|
|
8
|
-
"""Used for registration numbers
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
14
|
+
"""Used for registration numbers and other codes assigned by an authority
|
|
15
|
+
to identify an entity. This might include tax identifiers and statistical
|
|
16
|
+
codes.
|
|
17
|
+
|
|
18
|
+
Since identifiers are high-value criteria when comparing two entities, numbers
|
|
19
|
+
should only be modelled as identifiers if they are long enough to be meaningful.
|
|
20
|
+
Four- or five-digit industry classifiers create more noise than value."""
|
|
21
|
+
|
|
22
|
+
COMPARE_CLEAN = re.compile(r"[\W_]+")
|
|
23
|
+
name = "identifier"
|
|
24
|
+
group = "identifiers"
|
|
25
|
+
label = _("Identifier")
|
|
26
|
+
plural = _("Identifiers")
|
|
27
|
+
matchable = True
|
|
28
|
+
pivot = True
|
|
29
|
+
max_length = 64
|
|
30
|
+
|
|
31
|
+
def clean_text(
|
|
32
|
+
self,
|
|
33
|
+
text: str,
|
|
34
|
+
fuzzy: bool = False,
|
|
35
|
+
format: Optional[str] = None,
|
|
36
|
+
proxy: Optional["EntityProxy"] = None,
|
|
37
|
+
) -> Optional[str]:
|
|
38
|
+
if format in get_identifier_format_names():
|
|
39
|
+
format_ = get_identifier_format(format)
|
|
40
|
+
return format_.normalize(text)
|
|
41
|
+
return text
|
|
42
|
+
|
|
43
|
+
def clean_compare(self, value: str) -> str:
|
|
21
44
|
# TODO: should this be used for normalization?
|
|
22
|
-
value = self.COMPARE_CLEAN.sub(
|
|
45
|
+
value = self.COMPARE_CLEAN.sub("", value)
|
|
23
46
|
return value.lower()
|
|
24
47
|
|
|
25
|
-
def compare(self, left, right):
|
|
48
|
+
def compare(self, left: str, right: str) -> float:
|
|
26
49
|
left = self.clean_compare(left)
|
|
27
50
|
right = self.clean_compare(right)
|
|
28
51
|
if left == right:
|
|
29
|
-
return .
|
|
30
|
-
|
|
31
|
-
return
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
52
|
+
return 1.0
|
|
53
|
+
elif left in right or right in left:
|
|
54
|
+
return len(shortest(left, right)) / len(longest(left, right))
|
|
55
|
+
return 0.0
|
|
56
|
+
|
|
57
|
+
def _specificity(self, value: str) -> float:
|
|
58
|
+
return dampen(4, 10, value)
|
|
59
|
+
|
|
60
|
+
def node_id(self, value: str) -> str:
|
|
61
|
+
return f"id:{value}"
|
followthemoney/types/ip.py
CHANGED
|
@@ -1,36 +1,50 @@
|
|
|
1
|
-
from
|
|
2
|
-
from normality import stringify
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
3
2
|
from ipaddress import ip_address
|
|
4
3
|
|
|
5
4
|
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
6
|
+
from followthemoney.util import defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
6
10
|
|
|
7
11
|
|
|
8
12
|
class IpType(PropertyType):
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
+
"""Internet protocol addresses. This supports both addresses used
|
|
14
|
+
by the protocol versions 4 (e.g. `192.168.1.143`) and 6
|
|
15
|
+
(e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
|
|
16
|
+
|
|
17
|
+
name = "ip"
|
|
18
|
+
group = "ips"
|
|
19
|
+
label = _("IP-Address")
|
|
20
|
+
plural = _("IP-Addresses")
|
|
21
|
+
matchable = True
|
|
22
|
+
pivot = True
|
|
23
|
+
max_length = 64
|
|
13
24
|
|
|
14
|
-
def validate(
|
|
25
|
+
def validate(
|
|
26
|
+
self, value: str, fuzzy: bool = False, format: Optional[str] = None
|
|
27
|
+
) -> bool:
|
|
15
28
|
"""Check to see if this is a valid ip address."""
|
|
16
29
|
try:
|
|
17
|
-
ip_address(
|
|
30
|
+
ip_address(value)
|
|
18
31
|
return True
|
|
19
32
|
except ValueError:
|
|
20
33
|
return False
|
|
21
34
|
|
|
22
|
-
def
|
|
35
|
+
def clean_text(
|
|
36
|
+
self,
|
|
37
|
+
text: str,
|
|
38
|
+
fuzzy: bool = False,
|
|
39
|
+
format: Optional[str] = None,
|
|
40
|
+
proxy: Optional["EntityProxy"] = None,
|
|
41
|
+
) -> Optional[str]:
|
|
23
42
|
"""Create a more clean, but still user-facing version of an
|
|
24
43
|
instance of the type."""
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def specificity(self, value):
|
|
33
|
-
return 1
|
|
34
|
-
|
|
35
|
-
def rdf(self, value):
|
|
36
|
-
return URIRef('ip:%s' % value)
|
|
44
|
+
try:
|
|
45
|
+
return str(ip_address(text))
|
|
46
|
+
except ValueError:
|
|
47
|
+
return None
|
|
48
|
+
|
|
49
|
+
def rdf(self, value: str) -> Identifier:
|
|
50
|
+
return URIRef(f"ip:{value}")
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Any, Optional, Sequence, TYPE_CHECKING
|
|
3
|
+
from banal import ensure_list
|
|
4
|
+
|
|
5
|
+
from followthemoney.types.common import PropertyType
|
|
6
|
+
from followthemoney.util import sanitize_text, defer as _
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from followthemoney.proxy import EntityProxy
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JsonType(PropertyType):
|
|
13
|
+
"""An encoded JSON object. This is used to store raw HTTP headers for documents
|
|
14
|
+
and some other edge cases. It's a really bad idea and we should try to get rid
|
|
15
|
+
of JSON properties."""
|
|
16
|
+
|
|
17
|
+
name = "json"
|
|
18
|
+
group = None
|
|
19
|
+
label = _("Nested data")
|
|
20
|
+
plural = _("Nested data")
|
|
21
|
+
matchable = False
|
|
22
|
+
|
|
23
|
+
def pack(self, obj: Any) -> Optional[str]:
|
|
24
|
+
"""Encode a given value to JSON."""
|
|
25
|
+
# TODO: use a JSON encoder that handles more types?
|
|
26
|
+
if obj is None:
|
|
27
|
+
return None
|
|
28
|
+
return json.dumps(obj)
|
|
29
|
+
|
|
30
|
+
def unpack(self, obj: str) -> Any:
|
|
31
|
+
"""Decode a given JSON object."""
|
|
32
|
+
try:
|
|
33
|
+
return json.loads(obj)
|
|
34
|
+
except Exception:
|
|
35
|
+
return obj
|
|
36
|
+
|
|
37
|
+
def clean(
|
|
38
|
+
self,
|
|
39
|
+
raw: Any,
|
|
40
|
+
fuzzy: bool = False,
|
|
41
|
+
format: Optional[str] = None,
|
|
42
|
+
proxy: Optional["EntityProxy"] = None,
|
|
43
|
+
) -> Optional[str]:
|
|
44
|
+
if not isinstance(raw, str):
|
|
45
|
+
return self.pack(raw)
|
|
46
|
+
else:
|
|
47
|
+
return sanitize_text(raw)
|
|
48
|
+
|
|
49
|
+
def join(self, values: Sequence[str]) -> str:
|
|
50
|
+
"""Turn multiple values into a JSON array."""
|
|
51
|
+
values = [self.unpack(v) for v in ensure_list(values)]
|
|
52
|
+
data = self.pack(values)
|
|
53
|
+
if data is None:
|
|
54
|
+
return "[]"
|
|
55
|
+
return data
|
|
56
|
+
|
|
57
|
+
def node_id(self, value: str) -> None:
|
|
58
|
+
return None
|
followthemoney/types/language.py
CHANGED
|
@@ -1,37 +1,124 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
from followthemoney.
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from babel.core import Locale
|
|
3
|
+
from rigour.langs import iso_639_alpha3
|
|
4
|
+
|
|
5
|
+
from followthemoney.types.common import EnumType, EnumValues
|
|
6
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
7
|
+
from followthemoney.util import defer as _, gettext
|
|
8
|
+
from followthemoney.util import get_env_list
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from followthemoney.proxy import EntityProxy
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class LanguageType(EnumType):
|
|
15
|
+
"""A human written language. This list is arbitrarily limited for some
|
|
16
|
+
weird upstream technical reasons, but we'll happily accept pull requests
|
|
17
|
+
for additional languages once there is a specific need for them to be
|
|
18
|
+
supported."""
|
|
19
|
+
|
|
20
|
+
name = "language"
|
|
21
|
+
group = "languages"
|
|
22
|
+
label = _("Language")
|
|
23
|
+
plural = _("Languages")
|
|
24
|
+
matchable = False
|
|
25
|
+
max_length = 16
|
|
26
|
+
|
|
27
|
+
# Language whitelist
|
|
28
|
+
LANGUAGES = [
|
|
29
|
+
"eng",
|
|
30
|
+
"fra",
|
|
31
|
+
"deu",
|
|
32
|
+
"rus",
|
|
33
|
+
"spa",
|
|
34
|
+
"nld",
|
|
35
|
+
"ron",
|
|
36
|
+
"kat",
|
|
37
|
+
"ara",
|
|
38
|
+
"tur",
|
|
39
|
+
"ltz",
|
|
40
|
+
"ell",
|
|
41
|
+
"lit",
|
|
42
|
+
"ukr",
|
|
43
|
+
"zho",
|
|
44
|
+
"bel",
|
|
45
|
+
"bul",
|
|
46
|
+
"bos",
|
|
47
|
+
"jpn",
|
|
48
|
+
"ces",
|
|
49
|
+
"lav",
|
|
50
|
+
"por",
|
|
51
|
+
"pol",
|
|
52
|
+
"hye",
|
|
53
|
+
"hrv",
|
|
54
|
+
"hin",
|
|
55
|
+
"heb",
|
|
56
|
+
"uzb",
|
|
57
|
+
"mon",
|
|
58
|
+
"urd",
|
|
59
|
+
"sqi",
|
|
60
|
+
"kor",
|
|
61
|
+
"isl",
|
|
62
|
+
"ita",
|
|
63
|
+
"est",
|
|
64
|
+
"nor",
|
|
65
|
+
"fas",
|
|
66
|
+
"swa",
|
|
67
|
+
"slv",
|
|
68
|
+
"slk",
|
|
69
|
+
"aze",
|
|
70
|
+
"tgk",
|
|
71
|
+
"kaz",
|
|
72
|
+
"tuk",
|
|
73
|
+
"kir",
|
|
74
|
+
"hun",
|
|
75
|
+
"dan",
|
|
76
|
+
"afr",
|
|
77
|
+
"swe",
|
|
78
|
+
"srp",
|
|
79
|
+
"ind",
|
|
80
|
+
"kan",
|
|
81
|
+
"mkd",
|
|
82
|
+
"mlt",
|
|
83
|
+
"msa",
|
|
84
|
+
"fin",
|
|
85
|
+
"cat",
|
|
86
|
+
"nep",
|
|
87
|
+
"tgl",
|
|
88
|
+
"fil",
|
|
89
|
+
"mya",
|
|
90
|
+
"khm",
|
|
91
|
+
"cnr",
|
|
92
|
+
]
|
|
93
|
+
LANGUAGES = get_env_list("FTM_LANGUAGES", LANGUAGES)
|
|
94
|
+
LANGUAGES = [lang.lower().strip() for lang in LANGUAGES]
|
|
95
|
+
|
|
96
|
+
def _locale_names(self, locale: Locale) -> EnumValues:
|
|
97
|
+
names = {
|
|
98
|
+
"ara": gettext("Arabic"),
|
|
99
|
+
"nor": gettext("Norwegian"),
|
|
100
|
+
"cnr": gettext("Montenegrin"),
|
|
101
|
+
}
|
|
102
|
+
for lang in self.LANGUAGES:
|
|
103
|
+
if lang not in names:
|
|
104
|
+
names[lang] = lang
|
|
105
|
+
for code, label in locale.languages.items():
|
|
106
|
+
code = iso_639_alpha3(code)
|
|
107
|
+
if code in self.LANGUAGES and names[code] == code:
|
|
108
|
+
names[code] = label
|
|
109
|
+
return names
|
|
110
|
+
|
|
111
|
+
def clean_text(
|
|
112
|
+
self,
|
|
113
|
+
text: str,
|
|
114
|
+
fuzzy: bool = False,
|
|
115
|
+
format: Optional[str] = None,
|
|
116
|
+
proxy: Optional["EntityProxy"] = None,
|
|
117
|
+
) -> Optional[str]:
|
|
118
|
+
code = iso_639_alpha3(text)
|
|
119
|
+
if code not in self.LANGUAGES:
|
|
120
|
+
return None
|
|
121
|
+
return code
|
|
122
|
+
|
|
123
|
+
def rdf(self, value: str) -> Identifier:
|
|
124
|
+
return URIRef(f"iso-639:{value}")
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
|
+
from rigour.mime import normalize_mimetype, parse_mimetype
|
|
3
|
+
from rigour.mime import DEFAULT
|
|
4
|
+
|
|
5
|
+
from followthemoney.types.common import PropertyType
|
|
6
|
+
from followthemoney.rdf import URIRef, Identifier
|
|
7
|
+
from followthemoney.util import defer as _
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from followthemoney.proxy import EntityProxy
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MimeType(PropertyType):
|
|
14
|
+
"""A MIME media type is a specification of a content type on a network.
|
|
15
|
+
Each MIME type is assigned by IANA and consists of two parts: the type
|
|
16
|
+
and sub-type. Common examples are: `text/plain`, `application/json` and
|
|
17
|
+
`application/pdf`.
|
|
18
|
+
|
|
19
|
+
MIME type properties do not contain parameters as used in HTTP headers,
|
|
20
|
+
like `charset=UTF-8`."""
|
|
21
|
+
|
|
22
|
+
name = "mimetype"
|
|
23
|
+
group = "mimetypes"
|
|
24
|
+
label = _("MIME-Type")
|
|
25
|
+
plural = _("MIME-Types")
|
|
26
|
+
matchable = False
|
|
27
|
+
|
|
28
|
+
def clean_text(
|
|
29
|
+
self,
|
|
30
|
+
text: str,
|
|
31
|
+
fuzzy: bool = False,
|
|
32
|
+
format: Optional[str] = None,
|
|
33
|
+
proxy: Optional["EntityProxy"] = None,
|
|
34
|
+
) -> Optional[str]:
|
|
35
|
+
text = normalize_mimetype(text)
|
|
36
|
+
if text != DEFAULT:
|
|
37
|
+
return text
|
|
38
|
+
return None
|
|
39
|
+
|
|
40
|
+
def rdf(self, value: str) -> Identifier:
|
|
41
|
+
return URIRef(f"urn:mimetype:{value}")
|
|
42
|
+
|
|
43
|
+
def caption(self, value: str) -> str:
|
|
44
|
+
return parse_mimetype(value).label or value
|
followthemoney/types/name.py
CHANGED
|
@@ -1,24 +1,68 @@
|
|
|
1
|
-
from
|
|
1
|
+
from typing import TYPE_CHECKING, Optional, Sequence
|
|
2
|
+
from normality import slugify
|
|
2
3
|
from normality.cleaning import collapse_spaces, strip_quotes
|
|
4
|
+
from rigour.env import MAX_NAME_LENGTH
|
|
5
|
+
from rigour.names import pick_name
|
|
6
|
+
from rigour.text.distance import levenshtein_similarity
|
|
7
|
+
from fingerprints.cleanup import clean_name_light
|
|
3
8
|
|
|
4
9
|
from followthemoney.types.common import PropertyType
|
|
5
10
|
from followthemoney.util import dampen
|
|
11
|
+
from followthemoney.util import defer as _
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from followthemoney.proxy import EntityProxy
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
class NameType(PropertyType):
|
|
9
|
-
name
|
|
10
|
-
|
|
11
|
-
|
|
18
|
+
"""A name used for a person or company. This is assumed to be as complete
|
|
19
|
+
a name as available - when a first name, family name or patronymic are given
|
|
20
|
+
separately, these are stored to string-type properties instead.
|
|
21
|
+
|
|
22
|
+
No validation rules apply, and things having multiple names must be considered
|
|
23
|
+
a perfectly ordinary case."""
|
|
24
|
+
|
|
25
|
+
name = "name"
|
|
26
|
+
group = "names"
|
|
27
|
+
label = _("Name")
|
|
28
|
+
plural = _("Names")
|
|
29
|
+
matchable = True
|
|
30
|
+
pivot = True
|
|
31
|
+
max_length = MAX_NAME_LENGTH
|
|
12
32
|
|
|
13
|
-
def clean_text(
|
|
33
|
+
def clean_text(
|
|
34
|
+
self,
|
|
35
|
+
text: str,
|
|
36
|
+
fuzzy: bool = False,
|
|
37
|
+
format: Optional[str] = None,
|
|
38
|
+
proxy: Optional["EntityProxy"] = None,
|
|
39
|
+
) -> Optional[str]:
|
|
14
40
|
"""Basic clean-up."""
|
|
15
|
-
name = strip_quotes(
|
|
16
|
-
|
|
17
|
-
return name
|
|
41
|
+
name = strip_quotes(text)
|
|
42
|
+
return collapse_spaces(name)
|
|
18
43
|
|
|
19
|
-
def
|
|
44
|
+
def pick(self, values: Sequence[str]) -> Optional[str]:
|
|
45
|
+
"""From a set of names, pick the most plausible user-facing one."""
|
|
46
|
+
return pick_name(list(values))
|
|
47
|
+
|
|
48
|
+
def _specificity(self, value: str) -> float:
|
|
20
49
|
# TODO: insert artificial intelligence here.
|
|
21
|
-
return dampen(3, 50, value)
|
|
50
|
+
return dampen(3, 50, value)
|
|
51
|
+
|
|
52
|
+
def compare(self, left: str, right: str) -> float:
|
|
53
|
+
"""Compare two names for similarity."""
|
|
54
|
+
left_clean = clean_name_light(left)
|
|
55
|
+
right_clean = clean_name_light(right)
|
|
56
|
+
if left_clean is None or right_clean is None:
|
|
57
|
+
return 0.0
|
|
58
|
+
return levenshtein_similarity(
|
|
59
|
+
left_clean,
|
|
60
|
+
right_clean,
|
|
61
|
+
max_length=self.max_length,
|
|
62
|
+
)
|
|
22
63
|
|
|
23
|
-
def
|
|
24
|
-
|
|
64
|
+
def node_id(self, value: str) -> Optional[str]:
|
|
65
|
+
slug = slugify(value)
|
|
66
|
+
if slug is None:
|
|
67
|
+
return None
|
|
68
|
+
return f"name:{slug}"
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from followthemoney.types.common import PropertyType
|
|
5
|
+
from followthemoney.util import defer as _
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NumberType(PropertyType):
|
|
9
|
+
"""A numeric value, like the size of a piece of land, or the value of a
|
|
10
|
+
contract. Since all property values in FtM are strings, this is also a
|
|
11
|
+
string and there is no specified format (e.g. `1,000.00` vs. `1.000,00`).
|
|
12
|
+
|
|
13
|
+
In the future we might want to enable annotations for format, units, or
|
|
14
|
+
even to introduce a separate property type for monetary values."""
|
|
15
|
+
|
|
16
|
+
CAST_RE = re.compile(r"[^0-9\-\.]")
|
|
17
|
+
name = "number"
|
|
18
|
+
label = _("Number")
|
|
19
|
+
plural = _("Numbers")
|
|
20
|
+
matchable = False
|
|
21
|
+
|
|
22
|
+
def node_id(self, value: str) -> None:
|
|
23
|
+
return None
|
|
24
|
+
|
|
25
|
+
def to_number(self, value: str) -> Optional[float]:
|
|
26
|
+
try:
|
|
27
|
+
value = self.CAST_RE.sub("", value)
|
|
28
|
+
return float(value)
|
|
29
|
+
except Exception:
|
|
30
|
+
return None
|