followthemoney 1.3.6__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186) hide show
  1. followthemoney/__init__.py +5 -3
  2. followthemoney/cli/__init__.py +17 -0
  3. followthemoney/cli/aggregate.py +56 -0
  4. followthemoney/cli/cli.py +88 -0
  5. followthemoney/cli/exports.py +121 -0
  6. followthemoney/cli/mapping.py +85 -0
  7. followthemoney/cli/sieve.py +67 -0
  8. followthemoney/cli/util.py +142 -0
  9. followthemoney/compare.py +132 -55
  10. followthemoney/exc.py +19 -6
  11. followthemoney/export/common.py +29 -0
  12. followthemoney/export/csv.py +82 -0
  13. followthemoney/export/excel.py +75 -0
  14. followthemoney/export/graph.py +79 -0
  15. followthemoney/export/neo4j.py +182 -0
  16. followthemoney/export/rdf.py +26 -0
  17. followthemoney/graph.py +308 -0
  18. followthemoney/helpers.py +212 -0
  19. followthemoney/mapping/__init__.py +1 -1
  20. followthemoney/mapping/csv.py +67 -35
  21. followthemoney/mapping/entity.py +116 -44
  22. followthemoney/mapping/property.py +90 -44
  23. followthemoney/mapping/query.py +27 -19
  24. followthemoney/mapping/source.py +15 -5
  25. followthemoney/mapping/sql.py +75 -61
  26. followthemoney/messages.py +13 -7
  27. followthemoney/model.py +108 -56
  28. followthemoney/namespace.py +119 -0
  29. followthemoney/offshore.py +48 -0
  30. followthemoney/ontology.py +77 -0
  31. followthemoney/property.py +204 -71
  32. followthemoney/proxy.py +455 -118
  33. followthemoney/rdf.py +9 -0
  34. followthemoney/schema/Address.yaml +78 -0
  35. followthemoney/schema/Airplane.yaml +17 -10
  36. followthemoney/schema/Analyzable.yaml +54 -0
  37. followthemoney/schema/Article.yaml +16 -0
  38. followthemoney/schema/Assessment.yaml +32 -0
  39. followthemoney/schema/Asset.yaml +10 -4
  40. followthemoney/schema/Associate.yaml +41 -0
  41. followthemoney/schema/Audio.yaml +24 -0
  42. followthemoney/schema/BankAccount.yaml +53 -9
  43. followthemoney/schema/Call.yaml +48 -0
  44. followthemoney/schema/CallForTenders.yaml +117 -0
  45. followthemoney/schema/Company.yaml +37 -12
  46. followthemoney/schema/Contract.yaml +41 -7
  47. followthemoney/schema/ContractAward.yaml +30 -11
  48. followthemoney/schema/CourtCase.yaml +16 -10
  49. followthemoney/schema/CourtCaseParty.yaml +17 -6
  50. followthemoney/schema/CryptoWallet.yaml +48 -0
  51. followthemoney/schema/Debt.yaml +37 -0
  52. followthemoney/schema/Directorship.yaml +17 -4
  53. followthemoney/schema/Document.yaml +72 -139
  54. followthemoney/schema/Documentation.yml +38 -0
  55. followthemoney/schema/EconomicActivity.yaml +32 -17
  56. followthemoney/schema/Email.yaml +76 -0
  57. followthemoney/schema/Employment.yaml +39 -0
  58. followthemoney/schema/Event.yaml +35 -3
  59. followthemoney/schema/Family.yaml +41 -0
  60. followthemoney/schema/Folder.yaml +13 -0
  61. followthemoney/schema/HyperText.yaml +21 -0
  62. followthemoney/schema/Identification.yaml +40 -0
  63. followthemoney/schema/Image.yaml +25 -0
  64. followthemoney/schema/Interest.yaml +3 -6
  65. followthemoney/schema/Interval.yaml +56 -5
  66. followthemoney/schema/LegalEntity.yaml +81 -20
  67. followthemoney/schema/License.yaml +7 -3
  68. followthemoney/schema/Membership.yaml +19 -4
  69. followthemoney/schema/Mention.yaml +54 -0
  70. followthemoney/schema/Message.yaml +73 -0
  71. followthemoney/schema/Note.yaml +23 -0
  72. followthemoney/schema/Occupancy.yaml +40 -0
  73. followthemoney/schema/Organization.yaml +38 -3
  74. followthemoney/schema/Ownership.yaml +16 -4
  75. followthemoney/schema/Package.yaml +17 -0
  76. followthemoney/schema/Page.yaml +43 -0
  77. followthemoney/schema/Pages.yaml +23 -0
  78. followthemoney/schema/Passport.yaml +15 -17
  79. followthemoney/schema/Payment.yaml +38 -7
  80. followthemoney/schema/Person.yaml +61 -5
  81. followthemoney/schema/PlainText.yaml +17 -0
  82. followthemoney/schema/Position.yaml +50 -0
  83. followthemoney/schema/Post.yaml +42 -0
  84. followthemoney/schema/Project.yaml +27 -0
  85. followthemoney/schema/ProjectParticipant.yaml +36 -0
  86. followthemoney/schema/PublicBody.yaml +14 -3
  87. followthemoney/schema/RealEstate.yaml +19 -3
  88. followthemoney/schema/Representation.yaml +17 -6
  89. followthemoney/schema/Sanction.yaml +44 -20
  90. followthemoney/schema/Security.yaml +59 -0
  91. followthemoney/schema/Similar.yaml +37 -0
  92. followthemoney/schema/Succession.yaml +36 -0
  93. followthemoney/schema/Table.yaml +32 -0
  94. followthemoney/schema/TaxRoll.yaml +27 -9
  95. followthemoney/schema/Thing.yaml +69 -13
  96. followthemoney/schema/Trip.yaml +42 -0
  97. followthemoney/schema/UnknownLink.yaml +17 -6
  98. followthemoney/schema/UserAccount.yaml +44 -0
  99. followthemoney/schema/Value.yaml +5 -1
  100. followthemoney/schema/Vehicle.yaml +25 -8
  101. followthemoney/schema/Vessel.yaml +18 -10
  102. followthemoney/schema/Video.yaml +20 -0
  103. followthemoney/schema/Workbook.yaml +18 -0
  104. followthemoney/schema.py +406 -135
  105. followthemoney/translations/ar/LC_MESSAGES/followthemoney.mo +0 -0
  106. followthemoney/translations/ar/LC_MESSAGES/followthemoney.po +2900 -787
  107. followthemoney/translations/bs/LC_MESSAGES/followthemoney.mo +0 -0
  108. followthemoney/translations/bs/LC_MESSAGES/followthemoney.po +2108 -520
  109. followthemoney/translations/de/LC_MESSAGES/followthemoney.mo +0 -0
  110. followthemoney/translations/de/LC_MESSAGES/followthemoney.po +2902 -782
  111. followthemoney/translations/es/LC_MESSAGES/followthemoney.mo +0 -0
  112. followthemoney/translations/es/LC_MESSAGES/followthemoney.po +2893 -779
  113. followthemoney/translations/fr/LC_MESSAGES/followthemoney.mo +0 -0
  114. followthemoney/translations/fr/LC_MESSAGES/followthemoney.po +4362 -0
  115. followthemoney/translations/fr/followthemoney.po +3861 -0
  116. followthemoney/translations/messages.pot +3021 -725
  117. followthemoney/translations/nb/LC_MESSAGES/followthemoney.mo +0 -0
  118. followthemoney/translations/nb/LC_MESSAGES/followthemoney.po +3778 -0
  119. followthemoney/translations/nl/LC_MESSAGES/followthemoney.mo +0 -0
  120. followthemoney/translations/nl/LC_MESSAGES/followthemoney.po +3837 -0
  121. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.mo +0 -0
  122. followthemoney/translations/pt_BR/LC_MESSAGES/followthemoney.po +3784 -0
  123. followthemoney/translations/ru/LC_MESSAGES/followthemoney.mo +0 -0
  124. followthemoney/translations/ru/LC_MESSAGES/followthemoney.po +2837 -539
  125. followthemoney/translations/ru/followthemoney.po +4221 -0
  126. followthemoney/translations/tr/LC_MESSAGES/followthemoney.mo +0 -0
  127. followthemoney/translations/tr/LC_MESSAGES/followthemoney.po +2073 -491
  128. followthemoney/types/__init__.py +35 -17
  129. followthemoney/types/address.py +41 -21
  130. followthemoney/types/checksum.py +25 -0
  131. followthemoney/types/common.py +233 -88
  132. followthemoney/types/country.py +89 -56
  133. followthemoney/types/date.py +59 -76
  134. followthemoney/types/email.py +66 -35
  135. followthemoney/types/entity.py +66 -13
  136. followthemoney/types/gender.py +66 -0
  137. followthemoney/types/iban.py +47 -28
  138. followthemoney/types/identifier.py +49 -22
  139. followthemoney/types/ip.py +35 -21
  140. followthemoney/types/json.py +58 -0
  141. followthemoney/types/language.py +124 -37
  142. followthemoney/types/mimetype.py +44 -0
  143. followthemoney/types/name.py +56 -12
  144. followthemoney/types/number.py +30 -0
  145. followthemoney/types/phone.py +92 -34
  146. followthemoney/types/registry.py +52 -0
  147. followthemoney/types/string.py +43 -0
  148. followthemoney/types/topic.py +94 -0
  149. followthemoney/types/url.py +39 -17
  150. followthemoney/util.py +139 -45
  151. followthemoney-3.8.0.dist-info/METADATA +153 -0
  152. followthemoney-3.8.0.dist-info/RECORD +157 -0
  153. {followthemoney-1.3.6.dist-info → followthemoney-3.8.0.dist-info}/WHEEL +1 -2
  154. followthemoney-3.8.0.dist-info/entry_points.txt +17 -0
  155. followthemoney-1.3.6.dist-info/LICENSE.txt → followthemoney-3.8.0.dist-info/licenses/LICENSE +1 -1
  156. followthemoney/link.py +0 -75
  157. followthemoney/schema/Associate.yml +0 -19
  158. followthemoney/schema/Family.yml +0 -19
  159. followthemoney/schema/Land.yml +0 -9
  160. followthemoney/schema/Relationship.yaml +0 -26
  161. followthemoney/types/domain.py +0 -50
  162. followthemoney-1.3.6.dist-info/DESCRIPTION.rst +0 -3
  163. followthemoney-1.3.6.dist-info/METADATA +0 -39
  164. followthemoney-1.3.6.dist-info/RECORD +0 -108
  165. followthemoney-1.3.6.dist-info/entry_points.txt +0 -3
  166. followthemoney-1.3.6.dist-info/metadata.json +0 -1
  167. followthemoney-1.3.6.dist-info/namespace_packages.txt +0 -1
  168. followthemoney-1.3.6.dist-info/top_level.txt +0 -3
  169. ns/ontology.py +0 -128
  170. tests/types/test_addresses.py +0 -24
  171. tests/types/test_common.py +0 -27
  172. tests/types/test_countries.py +0 -21
  173. tests/types/test_dates.py +0 -72
  174. tests/types/test_domains.py +0 -23
  175. tests/types/test_emails.py +0 -30
  176. tests/types/test_entity.py +0 -16
  177. tests/types/test_iban.py +0 -109
  178. tests/types/test_identifiers.py +0 -25
  179. tests/types/test_ip.py +0 -26
  180. tests/types/test_languages.py +0 -20
  181. tests/types/test_names.py +0 -33
  182. tests/types/test_phones.py +0 -24
  183. tests/types/test_registry.py +0 -14
  184. tests/types/test_urls.py +0 -23
  185. {ns → followthemoney/export}/__init__.py +0 -0
  186. /tests/types/__init__.py → /followthemoney/py.typed +0 -0
@@ -1,39 +1,58 @@
1
- from rdflib import URIRef
2
- from normality import stringify
3
- from schwifty import IBAN
1
+ from typing import Optional, TYPE_CHECKING
2
+ from rigour.ids import IBAN
4
3
 
5
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import sanitize_text, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
6
10
 
7
11
 
8
12
  class IbanType(PropertyType):
9
- name = 'iban'
10
- group = 'ibans'
11
- prefix = 'iban'
12
- strong = False
13
-
14
- def validate(self, iban, **kwargs):
15
- iban = stringify(iban)
16
- if iban is None:
17
- return False
18
- try:
19
- IBAN(iban)
20
- return True
21
- except ValueError as ex:
22
- print(ex)
13
+ """An international bank account number, as defined in ISO 13616. IBANs are
14
+ managed by SWIFT used in the European SEPA payment system.
15
+
16
+ A notable aspect of IBANs is that they share a country prefix and validation
17
+ mechanism, but the specific length of an IBAN is dependent on the country
18
+ code defined in the first two characters: `NO8330001234567` and
19
+ `CY21002001950000357001234567` are both valid values."""
20
+
21
+ name = "iban"
22
+ group = "ibans"
23
+ label = _("IBAN")
24
+ plural = _("IBANs")
25
+ matchable = True
26
+ pivot = True
27
+ max_length = 64
28
+
29
+ def validate(
30
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
31
+ ) -> bool:
32
+ text = sanitize_text(value)
33
+ if text is None:
23
34
  return False
24
-
25
- def clean_text(self, text, **kwargs):
35
+ return IBAN.is_valid(text)
36
+
37
+ def clean_text(
38
+ self,
39
+ text: str,
40
+ fuzzy: bool = False,
41
+ format: Optional[str] = None,
42
+ proxy: Optional["EntityProxy"] = None,
43
+ ) -> Optional[str]:
26
44
  """Create a more clean, but still user-facing version of an
27
45
  instance of the type."""
28
- return text.replace(" ", "").upper()
46
+ return IBAN.normalize(text)
47
+
48
+ def country_hint(self, value: str) -> str:
49
+ return value[:2].lower()
29
50
 
30
- def specificity(self, value):
31
- return 1
51
+ def rdf(self, value: str) -> Identifier:
52
+ return URIRef(self.node_id(value))
32
53
 
33
- def country_hint(self, value):
34
- value = stringify(value)
35
- if value is not None:
36
- return value[:2].lower()
54
+ def node_id(self, value: str) -> str:
55
+ return f"iban:{value.upper()}"
37
56
 
38
- def rdf(self, value):
39
- return URIRef('iban:%s' % value)
57
+ def caption(self, value: str) -> str:
58
+ return IBAN.format(value)
@@ -1,34 +1,61 @@
1
1
  import re
2
- from normality import normalize
2
+ from typing import Optional, TYPE_CHECKING
3
+ from rigour.ids import get_identifier_format_names, get_identifier_format
3
4
 
4
5
  from followthemoney.types.common import PropertyType
6
+ from followthemoney.util import dampen, shortest, longest
7
+ from followthemoney.util import defer as _
8
+
9
+ if TYPE_CHECKING:
10
+ from followthemoney.proxy import EntityProxy
5
11
 
6
12
 
7
13
  class IdentifierType(PropertyType):
8
- """Used for registration numbers, codes etc."""
9
- COMPARE_CLEAN = re.compile('[\W_]+')
10
- name = 'identifier'
11
- group = 'identifiers'
12
- prefix = 'ident'
13
- strong = False
14
-
15
- def normalize(self, text, **kwargs):
16
- """Normalize for comparison."""
17
- ids = super(IdentifierType, self).normalize(text, **kwargs)
18
- return [normalize(i) for i in ids]
19
-
20
- def clean_compare(self, value):
14
+ """Used for registration numbers and other codes assigned by an authority
15
+ to identify an entity. This might include tax identifiers and statistical
16
+ codes.
17
+
18
+ Since identifiers are high-value criteria when comparing two entities, numbers
19
+ should only be modelled as identifiers if they are long enough to be meaningful.
20
+ Four- or five-digit industry classifiers create more noise than value."""
21
+
22
+ COMPARE_CLEAN = re.compile(r"[\W_]+")
23
+ name = "identifier"
24
+ group = "identifiers"
25
+ label = _("Identifier")
26
+ plural = _("Identifiers")
27
+ matchable = True
28
+ pivot = True
29
+ max_length = 64
30
+
31
+ def clean_text(
32
+ self,
33
+ text: str,
34
+ fuzzy: bool = False,
35
+ format: Optional[str] = None,
36
+ proxy: Optional["EntityProxy"] = None,
37
+ ) -> Optional[str]:
38
+ if format in get_identifier_format_names():
39
+ format_ = get_identifier_format(format)
40
+ return format_.normalize(text)
41
+ return text
42
+
43
+ def clean_compare(self, value: str) -> str:
21
44
  # TODO: should this be used for normalization?
22
- value = self.COMPARE_CLEAN.sub('', value)
45
+ value = self.COMPARE_CLEAN.sub("", value)
23
46
  return value.lower()
24
47
 
25
- def compare(self, left, right):
48
+ def compare(self, left: str, right: str) -> float:
26
49
  left = self.clean_compare(left)
27
50
  right = self.clean_compare(right)
28
51
  if left == right:
29
- return .9
30
- if left in right:
31
- return .7
32
- if right in left:
33
- return .7
34
- return 0
52
+ return 1.0
53
+ elif left in right or right in left:
54
+ return len(shortest(left, right)) / len(longest(left, right))
55
+ return 0.0
56
+
57
+ def _specificity(self, value: str) -> float:
58
+ return dampen(4, 10, value)
59
+
60
+ def node_id(self, value: str) -> str:
61
+ return f"id:{value}"
@@ -1,36 +1,50 @@
1
- from rdflib import URIRef
2
- from normality import stringify
1
+ from typing import Optional, TYPE_CHECKING
3
2
  from ipaddress import ip_address
4
3
 
5
4
  from followthemoney.types.common import PropertyType
5
+ from followthemoney.rdf import URIRef, Identifier
6
+ from followthemoney.util import defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
6
10
 
7
11
 
8
12
  class IpType(PropertyType):
9
- name = 'ip'
10
- group = 'ips'
11
- prefix = 'ip'
12
- strong = False
13
+ """Internet protocol addresses. This supports both addresses used
14
+ by the protocol versions 4 (e.g. `192.168.1.143`) and 6
15
+ (e.g. `0:0:0:0:0:ffff:c0a8:18f`)."""
16
+
17
+ name = "ip"
18
+ group = "ips"
19
+ label = _("IP-Address")
20
+ plural = _("IP-Addresses")
21
+ matchable = True
22
+ pivot = True
23
+ max_length = 64
13
24
 
14
- def validate(self, ip, **kwargs):
25
+ def validate(
26
+ self, value: str, fuzzy: bool = False, format: Optional[str] = None
27
+ ) -> bool:
15
28
  """Check to see if this is a valid ip address."""
16
29
  try:
17
- ip_address(ip)
30
+ ip_address(value)
18
31
  return True
19
32
  except ValueError:
20
33
  return False
21
34
 
22
- def clean(self, text, **kwargs):
35
+ def clean_text(
36
+ self,
37
+ text: str,
38
+ fuzzy: bool = False,
39
+ format: Optional[str] = None,
40
+ proxy: Optional["EntityProxy"] = None,
41
+ ) -> Optional[str]:
23
42
  """Create a more clean, but still user-facing version of an
24
43
  instance of the type."""
25
- text = stringify(text)
26
- if text is not None:
27
- try:
28
- return str(ip_address(text))
29
- except ValueError:
30
- return None
31
-
32
- def specificity(self, value):
33
- return 1
34
-
35
- def rdf(self, value):
36
- return URIRef('ip:%s' % value)
44
+ try:
45
+ return str(ip_address(text))
46
+ except ValueError:
47
+ return None
48
+
49
+ def rdf(self, value: str) -> Identifier:
50
+ return URIRef(f"ip:{value}")
@@ -0,0 +1,58 @@
1
+ import json
2
+ from typing import Any, Optional, Sequence, TYPE_CHECKING
3
+ from banal import ensure_list
4
+
5
+ from followthemoney.types.common import PropertyType
6
+ from followthemoney.util import sanitize_text, defer as _
7
+
8
+ if TYPE_CHECKING:
9
+ from followthemoney.proxy import EntityProxy
10
+
11
+
12
+ class JsonType(PropertyType):
13
+ """An encoded JSON object. This is used to store raw HTTP headers for documents
14
+ and some other edge cases. It's a really bad idea and we should try to get rid
15
+ of JSON properties."""
16
+
17
+ name = "json"
18
+ group = None
19
+ label = _("Nested data")
20
+ plural = _("Nested data")
21
+ matchable = False
22
+
23
+ def pack(self, obj: Any) -> Optional[str]:
24
+ """Encode a given value to JSON."""
25
+ # TODO: use a JSON encoder that handles more types?
26
+ if obj is None:
27
+ return None
28
+ return json.dumps(obj)
29
+
30
+ def unpack(self, obj: str) -> Any:
31
+ """Decode a given JSON object."""
32
+ try:
33
+ return json.loads(obj)
34
+ except Exception:
35
+ return obj
36
+
37
+ def clean(
38
+ self,
39
+ raw: Any,
40
+ fuzzy: bool = False,
41
+ format: Optional[str] = None,
42
+ proxy: Optional["EntityProxy"] = None,
43
+ ) -> Optional[str]:
44
+ if not isinstance(raw, str):
45
+ return self.pack(raw)
46
+ else:
47
+ return sanitize_text(raw)
48
+
49
+ def join(self, values: Sequence[str]) -> str:
50
+ """Turn multiple values into a JSON array."""
51
+ values = [self.unpack(v) for v in ensure_list(values)]
52
+ data = self.pack(values)
53
+ if data is None:
54
+ return "[]"
55
+ return data
56
+
57
+ def node_id(self, value: str) -> None:
58
+ return None
@@ -1,37 +1,124 @@
1
- from rdflib import URIRef
2
- from normality import stringify
3
-
4
- from followthemoney.types.common import PropertyType
5
- from followthemoney.util import get_locale
6
-
7
-
8
- class LanguageType(PropertyType):
9
- name = 'language'
10
- group = 'languages'
11
- prefix = 'lang'
12
-
13
- def __init__(self, *args):
14
- self._names = {}
15
-
16
- @property
17
- def names(self):
18
- locale = get_locale()
19
- if locale not in self._names:
20
- self._names[locale] = {}
21
- for code, label in locale.languages.items():
22
- self._names[locale][code.lower()] = label
23
- return self._names[locale]
24
-
25
- def validate(self, text, **kwargs):
26
- text = stringify(text)
27
- if text is None:
28
- return False
29
- return text.lower() in self.names
30
-
31
- def clean_text(self, text, **kwargs):
32
- code = text.lower().strip()
33
- if code in self.names:
34
- return code
35
-
36
- def rdf(self, value):
37
- return URIRef('iso-639:%s' % value)
1
+ from typing import Optional, TYPE_CHECKING
2
+ from babel.core import Locale
3
+ from rigour.langs import iso_639_alpha3
4
+
5
+ from followthemoney.types.common import EnumType, EnumValues
6
+ from followthemoney.rdf import URIRef, Identifier
7
+ from followthemoney.util import defer as _, gettext
8
+ from followthemoney.util import get_env_list
9
+
10
+ if TYPE_CHECKING:
11
+ from followthemoney.proxy import EntityProxy
12
+
13
+
14
+ class LanguageType(EnumType):
15
+ """A human written language. This list is arbitrarily limited for some
16
+ weird upstream technical reasons, but we'll happily accept pull requests
17
+ for additional languages once there is a specific need for them to be
18
+ supported."""
19
+
20
+ name = "language"
21
+ group = "languages"
22
+ label = _("Language")
23
+ plural = _("Languages")
24
+ matchable = False
25
+ max_length = 16
26
+
27
+ # Language whitelist
28
+ LANGUAGES = [
29
+ "eng",
30
+ "fra",
31
+ "deu",
32
+ "rus",
33
+ "spa",
34
+ "nld",
35
+ "ron",
36
+ "kat",
37
+ "ara",
38
+ "tur",
39
+ "ltz",
40
+ "ell",
41
+ "lit",
42
+ "ukr",
43
+ "zho",
44
+ "bel",
45
+ "bul",
46
+ "bos",
47
+ "jpn",
48
+ "ces",
49
+ "lav",
50
+ "por",
51
+ "pol",
52
+ "hye",
53
+ "hrv",
54
+ "hin",
55
+ "heb",
56
+ "uzb",
57
+ "mon",
58
+ "urd",
59
+ "sqi",
60
+ "kor",
61
+ "isl",
62
+ "ita",
63
+ "est",
64
+ "nor",
65
+ "fas",
66
+ "swa",
67
+ "slv",
68
+ "slk",
69
+ "aze",
70
+ "tgk",
71
+ "kaz",
72
+ "tuk",
73
+ "kir",
74
+ "hun",
75
+ "dan",
76
+ "afr",
77
+ "swe",
78
+ "srp",
79
+ "ind",
80
+ "kan",
81
+ "mkd",
82
+ "mlt",
83
+ "msa",
84
+ "fin",
85
+ "cat",
86
+ "nep",
87
+ "tgl",
88
+ "fil",
89
+ "mya",
90
+ "khm",
91
+ "cnr",
92
+ ]
93
+ LANGUAGES = get_env_list("FTM_LANGUAGES", LANGUAGES)
94
+ LANGUAGES = [lang.lower().strip() for lang in LANGUAGES]
95
+
96
+ def _locale_names(self, locale: Locale) -> EnumValues:
97
+ names = {
98
+ "ara": gettext("Arabic"),
99
+ "nor": gettext("Norwegian"),
100
+ "cnr": gettext("Montenegrin"),
101
+ }
102
+ for lang in self.LANGUAGES:
103
+ if lang not in names:
104
+ names[lang] = lang
105
+ for code, label in locale.languages.items():
106
+ code = iso_639_alpha3(code)
107
+ if code in self.LANGUAGES and names[code] == code:
108
+ names[code] = label
109
+ return names
110
+
111
+ def clean_text(
112
+ self,
113
+ text: str,
114
+ fuzzy: bool = False,
115
+ format: Optional[str] = None,
116
+ proxy: Optional["EntityProxy"] = None,
117
+ ) -> Optional[str]:
118
+ code = iso_639_alpha3(text)
119
+ if code not in self.LANGUAGES:
120
+ return None
121
+ return code
122
+
123
+ def rdf(self, value: str) -> Identifier:
124
+ return URIRef(f"iso-639:{value}")
@@ -0,0 +1,44 @@
1
+ from typing import Optional, TYPE_CHECKING
2
+ from rigour.mime import normalize_mimetype, parse_mimetype
3
+ from rigour.mime import DEFAULT
4
+
5
+ from followthemoney.types.common import PropertyType
6
+ from followthemoney.rdf import URIRef, Identifier
7
+ from followthemoney.util import defer as _
8
+
9
+ if TYPE_CHECKING:
10
+ from followthemoney.proxy import EntityProxy
11
+
12
+
13
+ class MimeType(PropertyType):
14
+ """A MIME media type are a specification of a content type on a network.
15
+ Each MIME type is assigned by IANA and consists of two parts: the type
16
+ and sub-type. Common examples are: `text/plain`, `application/json` and
17
+ `application/pdf`.
18
+
19
+ MIME type properties do not contain parameters as used in HTTP headers,
20
+ like `charset=UTF-8`."""
21
+
22
+ name = "mimetype"
23
+ group = "mimetypes"
24
+ label = _("MIME-Type")
25
+ plural = _("MIME-Types")
26
+ matchable = False
27
+
28
+ def clean_text(
29
+ self,
30
+ text: str,
31
+ fuzzy: bool = False,
32
+ format: Optional[str] = None,
33
+ proxy: Optional["EntityProxy"] = None,
34
+ ) -> Optional[str]:
35
+ text = normalize_mimetype(text)
36
+ if text != DEFAULT:
37
+ return text
38
+ return None
39
+
40
+ def rdf(self, value: str) -> Identifier:
41
+ return URIRef(f"urn:mimetype:{value}")
42
+
43
+ def caption(self, value: str) -> str:
44
+ return parse_mimetype(value).label or value
@@ -1,24 +1,68 @@
1
- from Levenshtein import jaro_winkler
1
+ from typing import TYPE_CHECKING, Optional, Sequence
2
+ from normality import slugify
2
3
  from normality.cleaning import collapse_spaces, strip_quotes
4
+ from rigour.env import MAX_NAME_LENGTH
5
+ from rigour.names import pick_name
6
+ from rigour.text.distance import levenshtein_similarity
7
+ from fingerprints.cleanup import clean_name_light
3
8
 
4
9
  from followthemoney.types.common import PropertyType
5
10
  from followthemoney.util import dampen
11
+ from followthemoney.util import defer as _
12
+
13
+ if TYPE_CHECKING:
14
+ from followthemoney.proxy import EntityProxy
6
15
 
7
16
 
8
17
  class NameType(PropertyType):
9
- name = 'name'
10
- group = 'names'
11
- prefix = 'n'
18
+ """A name used for a person or company. This is assumed to be as complete
19
+ a name as available - when a first name, family name or patronymic are given
20
+ separately, these are stored to string-type properties instead.
21
+
22
+ No validation rules apply, and things having multiple names must be considered
23
+ a perfectly ordinary case."""
24
+
25
+ name = "name"
26
+ group = "names"
27
+ label = _("Name")
28
+ plural = _("Names")
29
+ matchable = True
30
+ pivot = True
31
+ max_length = MAX_NAME_LENGTH
12
32
 
13
- def clean_text(self, name, **kwargs):
33
+ def clean_text(
34
+ self,
35
+ text: str,
36
+ fuzzy: bool = False,
37
+ format: Optional[str] = None,
38
+ proxy: Optional["EntityProxy"] = None,
39
+ ) -> Optional[str]:
14
40
  """Basic clean-up."""
15
- name = strip_quotes(name)
16
- name = collapse_spaces(name)
17
- return name
41
+ name = strip_quotes(text)
42
+ return collapse_spaces(name)
18
43
 
19
- def specificity(self, value):
44
+ def pick(self, values: Sequence[str]) -> Optional[str]:
45
+ """From a set of names, pick the most plausible user-facing one."""
46
+ return pick_name(list(values))
47
+
48
+ def _specificity(self, value: str) -> float:
20
49
  # TODO: insert artificial intelligence here.
21
- return dampen(3, 50, value) * .8
50
+ return dampen(3, 50, value)
51
+
52
+ def compare(self, left: str, right: str) -> float:
53
+ """Compare two names for similarity."""
54
+ left_clean = clean_name_light(left)
55
+ right_clean = clean_name_light(right)
56
+ if left_clean is None or right_clean is None:
57
+ return 0.0
58
+ return levenshtein_similarity(
59
+ left_clean,
60
+ right_clean,
61
+ max_length=self.max_length,
62
+ )
22
63
 
23
- def compare(self, left, right):
24
- return jaro_winkler(left, right)
64
+ def node_id(self, value: str) -> Optional[str]:
65
+ slug = slugify(value)
66
+ if slug is None:
67
+ return None
68
+ return f"name:{slug}"
@@ -0,0 +1,30 @@
1
+ import re
2
+ from typing import Optional
3
+
4
+ from followthemoney.types.common import PropertyType
5
+ from followthemoney.util import defer as _
6
+
7
+
8
+ class NumberType(PropertyType):
9
+ """A numeric value, like the size of a piece of land, or the value of a
10
+ contract. Since all property values in FtM are strings, this is also a
11
+ string and there is no specified format (e.g. `1,000.00` vs. `1.000,00`).
12
+
13
+ In the future we might want to enable annotations for format, units, or
14
+ even to introduce a separate property type for monetary values."""
15
+
16
+ CAST_RE = re.compile(r"[^0-9\-\.]")
17
+ name = "number"
18
+ label = _("Number")
19
+ plural = _("Numbers")
20
+ matchable = False
21
+
22
+ def node_id(self, value: str) -> None:
23
+ return None
24
+
25
+ def to_number(self, value: str) -> Optional[float]:
26
+ try:
27
+ value = self.CAST_RE.sub("", value)
28
+ return float(value)
29
+ except Exception:
30
+ return None