followthemoney 1.3.7__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +130 -60
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +78 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +44 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +16 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +45 -21
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +436 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +50 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +50 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.1.dist-info/METADATA +153 -0
  152. followthemoney-3.8.1.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.7.dist-info → followthemoney-3.8.1.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.1.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.7.dist-info/LICENSE.txt → followthemoney-3.8.1.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.7.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.7.dist-info/METADATA +0 -39
  164. followthemoney-1.3.7.dist-info/RECORD +0 -108
  165. followthemoney-1.3.7.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.7.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.7.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.7.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -32
  172. tests/types/test_countries.py +0 -27
  173. tests/types/test_dates.py +0 -73
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -32
  176. tests/types/test_entity.py +0 -19
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -27
  179. tests/types/test_ip.py +0 -29
  180. tests/types/test_languages.py +0 -23
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -1,36 +1,50 @@
1
- from rdflib import URIRef
2
- from normality import stringify
1
+ from typing import Optional, TYPE_CHECKING
3
2
  from ipaddress import ip_address
4
3
 
5
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
6
10
 
7
11
 
8
12
  class IpType(PropertyType):
9
- name = 'ip'
10
- group = 'ips'
11
- prefix = 'ip'
12
- strong = False
13
+ """Internet protocol addresses. This supports both addresses used
14
+ by the protocol versions 4 (e.g. `192.168.1.143`) and 6
15
+ (e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
16
+
17
+ name = "ip"
18
+ group = "ips"
19
+ label = _("IP-Address")
20
+ plural = _("IP-Addresses")
21
+ matchable = True
22
+ pivot = True
23
+ max_length = 64
13
24
 
14
- def validate(self, ip, **kwargs):
25
+ def validate(
26
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
27
+ ) -> bool:
15
28
  """Check to see if this is a valid ip address."""
16
29
  try:
17
- ip_address(ip)
30
+ ip_address(value)
18
31
  return True
19
32
  except ValueError:
20
33
  return False
21
34
 
22
- def clean(self, text, **kwargs):
35
+ def clean_text(
36
+ self,
37
+ text: str,
38
+ fuzzy: bool = False,
39
+ format: Optional[str] = None,
40
+ proxy: Optional["EntityProxy"] = None,
41
+ ) -> Optional[str]:
23
42
  """Create a more clean, but still user-facing version of an
24
43
  instance of the type."""
25
- text = stringify(text)
26
- if text is not None:
27
- try:
28
- return str(ip_address(text))
29
- except ValueError:
30
- return None
31
-
32
- def specificity(self, value):
33
- return 1
34
-
35
- def rdf(self, value):
36
- return URIRef('ip:%s' % value)
44
+ try:
45
+ return str(ip_address(text))
46
+ except ValueError:
47
+ return None
48
+
49
+ def rdf(self, value: str) -> Identifier:
50
+ return URIRef(f"ip:{value}")
@@ -0,0 +1,58 @@
1
+ import json
2
+ from typing import Any, Optional, Sequence, TYPE_CHECKING
3
+ from banal import ensure_list
4
+
5
+ from followthemoney.types.common import PropertyType
6
+ from followthemoney.util import sanitize_text, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
10
+
11
+
12
+ class JsonType(PropertyType):
13
+ """An encoded JSON object. This is used to store raw HTTP headers for documents
14
+ and some other edge cases. It's a really bad idea and we should try to get rid
15
+ of JSON properties."""
16
+
17
+ name = "json"
18
+ group = None
19
+ label = _("Nested data")
20
+ plural = _("Nested data")
21
+ matchable = False
22
+
23
+ def pack(self, obj: Any) -> Optional[str]:
24
+ """Encode a given value to JSON."""
25
+ # TODO: use a JSON encoder that handles more types?
26
+ if obj is None:
27
+ return None
28
+ return json.dumps(obj)
29
+
30
+ def unpack(self, obj: str) -> Any:
31
+ """Decode a given JSON object."""
32
+ try:
33
+ return json.loads(obj)
34
+ except Exception:
35
+ return obj
36
+
37
+ def clean(
38
+ self,
39
+ raw: Any,
40
+ fuzzy: bool = False,
41
+ format: Optional[str] = None,
42
+ proxy: Optional["EntityProxy"] = None,
43
+ ) -> Optional[str]:
44
+ if not isinstance(raw, str):
45
+ return self.pack(raw)
46
+ else:
47
+ return sanitize_text(raw)
48
+
49
+ def join(self, values: Sequence[str]) -> str:
50
+ """Turn multiple values into a JSON array."""
51
+ values = [self.unpack(v) for v in ensure_list(values)]
52
+ data = self.pack(values)
53
+ if data is None:
54
+ return "[]"
55
+ return data
56
+
57
+ def node_id(self, value: str) -> None:
58
+ return None
@@ -1,37 +1,124 @@
1
- from rdflib import URIRef
2
- from normality import stringify
3
-
4
- from followthemoney.types.common import PropertyType
5
- from followthemoney.util import get_locale
6
-
7
-
8
- class LanguageType(PropertyType):
9
- name = 'language'
10
- group = 'languages'
11
- prefix = 'lang'
12
-
13
- def __init__(self, *args):
14
- self._names = {}
15
-
16
- @property
17
- def names(self):
18
- locale = get_locale()
19
- if locale not in self._names:
20
- self._names[locale] = {}
21
- for code, label in locale.languages.items():
22
- self._names[locale][code.lower()] = label
23
- return self._names[locale]
24
-
25
- def validate(self, text, **kwargs):
26
- text = stringify(text)
27
- if text is None:
28
- return False
29
- return text.lower() in self.names
30
-
31
- def clean_text(self, text, **kwargs):
32
- code = text.lower().strip()
33
- if code in self.names:
34
- return code
35
-
36
- def rdf(self, value):
37
- return URIRef('iso-639:%s' % value)
1
+ from typing import Optional, TYPE_CHECKING
2
+ from babel.core import Locale
3
+ from rigour.langs import iso_639_alpha3
4
+
5
+ from followthemoney.types.common import EnumType, EnumValues
6
+ from followthemoney.rdf import URIRef, Identifier
7
+ from followthemoney.util import defer as _, gettext
8
+ from followthemoney.util import get_env_list
9
+
10
+ if TYPE_CHECKING:
11
+ from followthemoney.proxy import EntityProxy
12
+
13
+
14
+ class LanguageType(EnumType):
15
+ """A human written language. This list is arbitrarily limited for some
16
+ weird upstream technical reasons, but we'll happily accept pull requests
17
+ for additional languages once there is a specific need for them to be
18
+ supported."""
19
+
20
+ name = "language"
21
+ group = "languages"
22
+ label = _("Language")
23
+ plural = _("Languages")
24
+ matchable = False
25
+ max_length = 16
26
+
27
+ # Language whitelist
28
+ LANGUAGES = [
29
+ "eng",
30
+ "fra",
31
+ "deu",
32
+ "rus",
33
+ "spa",
34
+ "nld",
35
+ "ron",
36
+ "kat",
37
+ "ara",
38
+ "tur",
39
+ "ltz",
40
+ "ell",
41
+ "lit",
42
+ "ukr",
43
+ "zho",
44
+ "bel",
45
+ "bul",
46
+ "bos",
47
+ "jpn",
48
+ "ces",
49
+ "lav",
50
+ "por",
51
+ "pol",
52
+ "hye",
53
+ "hrv",
54
+ "hin",
55
+ "heb",
56
+ "uzb",
57
+ "mon",
58
+ "urd",
59
+ "sqi",
60
+ "kor",
61
+ "isl",
62
+ "ita",
63
+ "est",
64
+ "nor",
65
+ "fas",
66
+ "swa",
67
+ "slv",
68
+ "slk",
69
+ "aze",
70
+ "tgk",
71
+ "kaz",
72
+ "tuk",
73
+ "kir",
74
+ "hun",
75
+ "dan",
76
+ "afr",
77
+ "swe",
78
+ "srp",
79
+ "ind",
80
+ "kan",
81
+ "mkd",
82
+ "mlt",
83
+ "msa",
84
+ "fin",
85
+ "cat",
86
+ "nep",
87
+ "tgl",
88
+ "fil",
89
+ "mya",
90
+ "khm",
91
+ "cnr",
92
+ ]
93
+ LANGUAGES = get_env_list("FTM_LANGUAGES", LANGUAGES)
94
+ LANGUAGES = [lang.lower().strip() for lang in LANGUAGES]
95
+
96
+ def _locale_names(self, locale: Locale) -> EnumValues:
97
+ names = {
98
+ "ara": gettext("Arabic"),
99
+ "nor": gettext("Norwegian"),
100
+ "cnr": gettext("Montenegrin"),
101
+ }
102
+ for lang in self.LANGUAGES:
103
+ if lang not in names:
104
+ names[lang] = lang
105
+ for code, label in locale.languages.items():
106
+ code = iso_639_alpha3(code)
107
+ if code in self.LANGUAGES and names[code] == code:
108
+ names[code] = label
109
+ return names
110
+
111
+ def clean_text(
112
+ self,
113
+ text: str,
114
+ fuzzy: bool = False,
115
+ format: Optional[str] = None,
116
+ proxy: Optional["EntityProxy"] = None,
117
+ ) -> Optional[str]:
118
+ code = iso_639_alpha3(text)
119
+ if code not in self.LANGUAGES:
120
+ return None
121
+ return code
122
+
123
+ def rdf(self, value: str) -> Identifier:
124
+ return URIRef(f"iso-639:{value}")
@@ -0,0 +1,44 @@
1
+ from typing import Optional, TYPE_CHECKING
2
+ from rigour.mime import normalize_mimetype, parse_mimetype
3
+ from rigour.mime import DEFAULT
4
+
5
+ from followthemoney.types.common import PropertyType
6
+ from followthemoney.rdf import URIRef, Identifier
7
+ from followthemoney.util import defer as _
8
+
9
+ if TYPE_CHECKING:
10
+ from followthemoney.proxy import EntityProxy
11
+
12
+
13
+ class MimeType(PropertyType):
14
+ """A MIME media type are a specification of a content type on a network.
15
+ Each MIME type is assigned by IANA and consists of two parts: the type
16
+ and sub-type. Common examples are: `text/plain`, `application/json` and
17
+ `application/pdf`.
18
+
19
+ MIME type properties do not contain parameters as used in HTTP headers,
20
+ like `charset=UTF-8`."""
21
+
22
+ name = "mimetype"
23
+ group = "mimetypes"
24
+ label = _("MIME-Type")
25
+ plural = _("MIME-Types")
26
+ matchable = False
27
+
28
+ def clean_text(
29
+ self,
30
+ text: str,
31
+ fuzzy: bool = False,
32
+ format: Optional[str] = None,
33
+ proxy: Optional["EntityProxy"] = None,
34
+ ) -> Optional[str]:
35
+ text = normalize_mimetype(text)
36
+ if text != DEFAULT:
37
+ return text
38
+ return None
39
+
40
+ def rdf(self, value: str) -> Identifier:
41
+ return URIRef(f"urn:mimetype:{value}")
42
+
43
+ def caption(self, value: str) -> str:
44
+ return parse_mimetype(value).label or value
@@ -1,24 +1,68 @@
1
- from Levenshtein import jaro_winkler
1
+ from typing import TYPE_CHECKING, Optional, Sequence
2
+ from normality import slugify
2
3
  from normality.cleaning import collapse_spaces, strip_quotes
4
+ from rigour.env import MAX_NAME_LENGTH
5
+ from rigour.names import pick_name
6
+ from rigour.text.distance import levenshtein_similarity
7
+ from fingerprints.cleanup import clean_name_light
3
8
 
4
9
  from followthemoney.types.common import PropertyType
5
10
  from followthemoney.util import dampen
11
+ from followthemoney.util import defer as _
12
+
13
+ if TYPE_CHECKING:
14
+ from followthemoney.proxy import EntityProxy
6
15
 
7
16
 
8
17
  class NameType(PropertyType):
9
- name = 'name'
10
- group = 'names'
11
- prefix = 'n'
18
+ """A name used for a person or company. This is assumed to be as complete
19
+ a name as available - when a first name, family name or patronymic are given
20
+ separately, these are stored to string-type properties instead.
21
+
22
+ No validation rules apply, and things having multiple names must be considered
23
+ a perfectly ordinary case."""
24
+
25
+ name = "name"
26
+ group = "names"
27
+ label = _("Name")
28
+ plural = _("Names")
29
+ matchable = True
30
+ pivot = True
31
+ max_length = MAX_NAME_LENGTH
12
32
 
13
- def clean_text(self, name, **kwargs):
33
+ def clean_text(
34
+ self,
35
+ text: str,
36
+ fuzzy: bool = False,
37
+ format: Optional[str] = None,
38
+ proxy: Optional["EntityProxy"] = None,
39
+ ) -> Optional[str]:
14
40
  """Basic clean-up."""
15
- name = strip_quotes(name)
16
- name = collapse_spaces(name)
17
- return name
41
+ name = strip_quotes(text)
42
+ return collapse_spaces(name)
18
43
 
19
- def specificity(self, value):
44
+ def pick(self, values: Sequence[str]) -> Optional[str]:
45
+ """From a set of names, pick the most plausible user-facing one."""
46
+ return pick_name(list(values))
47
+
48
+ def _specificity(self, value: str) -> float:
20
49
  # TODO: insert artificial intelligence here.
21
- return dampen(3, 50, value) * .8
50
+ return dampen(3, 50, value)
51
+
52
+ def compare(self, left: str, right: str) -> float:
53
+ """Compare two names for similarity."""
54
+ left_clean = clean_name_light(left)
55
+ right_clean = clean_name_light(right)
56
+ if left_clean is None or right_clean is None:
57
+ return 0.0
58
+ return levenshtein_similarity(
59
+ left_clean,
60
+ right_clean,
61
+ max_length=self.max_length,
62
+ )
22
63
 
23
- def compare(self, left, right):
24
- return jaro_winkler(left, right)
64
+ def node_id(self, value: str) -> Optional[str]:
65
+ slug = slugify(value)
66
+ if slug is None:
67
+ return None
68
+ return f"name:{slug}"
@@ -0,0 +1,30 @@
1
+ import re
2
+ from typing import Optional
3
+
4
+ from followthemoney.types.common import PropertyType
5
+ from followthemoney.util import defer as _
6
+
7
+
8
+ class NumberType(PropertyType):
9
+ """A numeric value, like the size of a piece of land, or the value of a
10
+ contract. Since all property values in FtM are strings, this is also a
11
+ string and there is no specified format (e.g. `1,000.00` vs. `1.000,00`).
12
+
13
+ In the future we might want to enable annotations for format, units, or
14
+ even to introduce a separate property type for monetary values."""
15
+
16
+ CAST_RE = re.compile(r"[^0-9\-\.]")
17
+ name = "number"
18
+ label = _("Number")
19
+ plural = _("Numbers")
20
+ matchable = False
21
+
22
+ def node_id(self, value: str) -> None:
23
+ return None
24
+
25
+ def to_number(self, value: str) -> Optional[float]:
26
+ try:
27
+ value = self.CAST_RE.sub("", value)
28
+ return float(value)
29
+ except Exception:
30
+ return None
@@ -1,31 +1,54 @@
1
- from rdflib import URIRef
2
- from banal import ensure_list
3
- from phonenumbers import geocoder
1
+ from typing import Iterable, Optional, TYPE_CHECKING
4
2
  from phonenumbers import parse as parse_number
5
- from phonenumbers import is_possible_number, is_valid_number, format_number
6
- from phonenumbers import PhoneNumberFormat
7
- from phonenumbers.phonenumberutil import NumberParseException
3
+ from phonenumbers import is_valid_number, format_number
4
+ from phonenumbers import PhoneNumber, PhoneNumberFormat
5
+ from phonenumbers.phonenumberutil import region_code_for_number, NumberParseException
8
6
 
9
7
  from followthemoney.types.common import PropertyType
8
+ from followthemoney.rdf import URIRef, Identifier
9
+ from followthemoney.util import defer as _
10
+ from followthemoney.util import dampen
11
+
12
+ if TYPE_CHECKING:
13
+ from followthemoney.proxy import EntityProxy
14
+
15
+
16
+ # TODO: for json schema export
17
+ # https://stackoverflow.com/questions/6478875/regular-expression-matching-e-164-formatted-phone-numbers
10
18
 
11
19
 
12
20
  class PhoneType(PropertyType):
13
- name = 'phone'
14
- group = 'phones'
15
- prefix = 'tel'
16
- strong = False
17
-
18
- def _clean_countries(self, countries, country):
19
- result = set([None])
20
- countries = ensure_list(countries)
21
- countries.extend(ensure_list(country))
22
- for country in countries:
23
- if isinstance(country, str):
24
- country = country.strip().upper()
25
- result.add(country)
26
- return result
27
-
28
- def clean_text(self, number, countries=None, country=None, **kwargs):
21
+ """A phone number in E.164 format. This means that phone numbers always
22
+ include an international country prefix (e.g. `+38760183628`). The
23
+ cleaning and validation functions for this try to be smart about by
24
+ accepting a list of countries as an argument in order to add the number
25
+ prefix.
26
+
27
+ When adding a property of this type to an entity, any country-type properties
28
+ defined for the entity are considered for validation. That means that adding a
29
+ phone number to an entity before adding a country can have a different
30
+ validation outcome from doing the two operations the other way around. Always
31
+ define the country first."""
32
+
33
+ name = "phone"
34
+ group = "phones"
35
+ label = _("Phone number")
36
+ plural = _("Phone numbers")
37
+ matchable = True
38
+ pivot = True
39
+ max_length = 64
40
+
41
+ def _clean_countries(
42
+ self, proxy: Optional["EntityProxy"]
43
+ ) -> Iterable[Optional[str]]:
44
+ yield None
45
+ if proxy is not None:
46
+ for country in proxy.countries:
47
+ yield country.upper()
48
+
49
+ def _parse_number(
50
+ self, number: str, proxy: Optional["EntityProxy"] = None
51
+ ) -> Iterable[PhoneNumber]:
29
52
  """Parse a phone number and return in international format.
30
53
 
31
54
  If no valid phone number can be detected, None is returned. If
@@ -34,24 +57,59 @@ class PhoneType(PropertyType):
34
57
 
35
58
  https://github.com/daviddrysdale/python-phonenumbers
36
59
  """
37
- for code in self._clean_countries(countries, country):
60
+ for code in self._clean_countries(proxy):
38
61
  try:
39
- num = parse_number(number, code)
40
- if is_possible_number(num):
41
- if is_valid_number(num):
42
- return format_number(num, PhoneNumberFormat.E164)
62
+ yield parse_number(number, code)
43
63
  except NumberParseException:
44
64
  pass
45
65
 
46
- def specificity(self, value):
47
- return 1
66
+ def validate(
67
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
68
+ ) -> bool:
69
+ for num in self._parse_number(value):
70
+ if is_valid_number(num):
71
+ return True
72
+ return False
73
+
74
+ def clean_text(
75
+ self,
76
+ text: str,
77
+ fuzzy: bool = False,
78
+ format: Optional[str] = None,
79
+ proxy: Optional["EntityProxy"] = None,
80
+ ) -> Optional[str]:
81
+ for num in self._parse_number(text, proxy=proxy):
82
+ if is_valid_number(num):
83
+ return str(format_number(num, PhoneNumberFormat.E164))
84
+ return None
48
85
 
49
- def country_hint(self, value):
86
+ def country_hint(self, value: str) -> Optional[str]:
50
87
  try:
51
88
  number = parse_number(value)
52
- return geocoder.region_code_for_number(number).lower()
89
+ code = region_code_for_number(number)
90
+ if code is None:
91
+ return None
92
+ return str(code).lower()
53
93
  except NumberParseException:
54
- pass
94
+ return None
95
+
96
+ def _specificity(self, value: str) -> float:
97
+ # TODO: insert artificial intelligence here.
98
+ return dampen(7, 11, value)
99
+
100
+ def rdf(self, value: str) -> Identifier:
101
+ node_id = self.node_id(value)
102
+ if node_id is not None:
103
+ return URIRef(node_id)
104
+ raise ValueError("Invalid phone number for serialisation: %s" % value)
105
+
106
+ def node_id(self, value: str) -> Optional[str]:
107
+ return f"tel:{value}"
55
108
 
56
- def rdf(self, value):
57
- return URIRef('tel:%s' % value)
109
+ def caption(self, value: str) -> str:
110
+ try:
111
+ number = parse_number(value)
112
+ formatted = format_number(number, PhoneNumberFormat.INTERNATIONAL)
113
+ return str(formatted)
114
+ except NumberParseException:
115
+ return value
@@ -0,0 +1,52 @@
1
+ from banal import ensure_list
2
+ from typing import Iterable, Set, Dict, Type, Union, List, Optional
3
+
4
+ from followthemoney.types.common import PropertyType
5
+
6
+
7
+ class Registry(object):
8
+ """This registry keeps the processing helpers for all property types
9
+ in the system. They are instantiated as singletons when the system is first
10
+ loaded. The registry can be used to get a type, which can itself then
11
+ clean, validate or format values of that type."""
12
+
13
+ def __init__(self) -> None:
14
+ self.named: Dict[str, PropertyType] = {}
15
+ self.matchable: Set[PropertyType] = set()
16
+ self.types: Set[PropertyType] = set()
17
+ self.groups: Dict[str, PropertyType] = {}
18
+ self.pivots: Set[PropertyType] = set()
19
+
20
+ def add(self, clazz: Type[PropertyType]) -> None:
21
+ """Add a singleton class."""
22
+ type_ = clazz()
23
+ self.named[clazz.name] = type_
24
+ self.types.add(type_)
25
+ if type_.matchable:
26
+ self.matchable.add(type_)
27
+ if type_.pivot:
28
+ self.pivots.add(type_)
29
+ if type_.group is not None:
30
+ self.groups[type_.group] = type_
31
+
32
+ def get(self, name: Union[str, PropertyType]) -> Optional[PropertyType]:
33
+ """For a given property type name, get its type object. This can also
34
+ be used via getattr, e.g. ``registry.phone``."""
35
+ # Allow transparent re-checking.
36
+ if isinstance(name, PropertyType):
37
+ return name
38
+ return self.named.get(name)
39
+
40
+ def get_types(
41
+ self, names: Iterable[Union[str, PropertyType]]
42
+ ) -> List[PropertyType]:
43
+ """Get a list of all type names."""
44
+ names = ensure_list(names)
45
+ types = [self.get(n) for n in names]
46
+ return [t for t in types if t is not None]
47
+
48
+ def __getitem__(self, name: str) -> PropertyType:
49
+ return self.named[name]
50
+
51
+ def __getattr__(self, name: str) -> PropertyType:
52
+ return self.named[name]