orthotypo 0.5.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +6 -1
- data/README.md +2 -2
- data/lib/orthotypo/analyzer.rb +18 -0
- data/lib/orthotypo/composer.rb +72 -10
- data/lib/orthotypo/version.rb +1 -1
- data/lib/orthotypo.rb +3 -0
- data/orthotypo.gemspec +1 -0
- data/spec/analyzer_spec.rb +8 -0
- data/spec/composer/fr_spec.rb +11 -1
- data/spec/orthotypo_spec.rb +10 -0
- metadata +18 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fd24cbdb2c5adaa4f6ecf91acf144025549b7599aeb0f944e3f1987331448450
|
4
|
+
data.tar.gz: 6d9a7bb478c4cc0b76722128a1065fc99666f7d1f284cc82cb6f26acfe89bf70
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e51b243378c8d97df6fd3bfa7f59a7aeb7f516d3f54d82cce4eb0ae8d3c7b6a5893211c857c72f8c249a73e0333f3d187ddf4d6145a45ab6de5039633aeaf457
|
7
|
+
data.tar.gz: dc77c1043a6210122c48a8d5849d63df04247d3d957707469d9e3029ed0cf1a9bec745cd12115bb4563254abc6e227c74e13f02285fc0cada9a0d75ab0be2ad6
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
orthotypo (0.5.1)
|
4
|
+
orthotypo (1.0.0)
|
5
5
|
htmlentities
|
6
|
+
nokogiri
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -14,6 +15,10 @@ GEM
|
|
14
15
|
htmlentities (4.3.4)
|
15
16
|
json (2.6.3)
|
16
17
|
language_server-protocol (3.17.0.3)
|
18
|
+
nokogiri (1.15.4-arm64-darwin)
|
19
|
+
racc (~> 1.4)
|
20
|
+
nokogiri (1.15.4-x86_64-darwin)
|
21
|
+
racc (~> 1.4)
|
17
22
|
parallel (1.23.0)
|
18
23
|
parser (3.2.2.4)
|
19
24
|
ast (~> 2.4.1)
|
data/README.md
CHANGED
@@ -28,10 +28,10 @@ Ajout d'espace fine insécable avant les signes doubles en français.
|
|
28
28
|
## Roadmap
|
29
29
|
|
30
30
|
### v1
|
31
|
-
|
32
|
-
2. Ne pas endommager l'HTML et les HTML entities (&nbsp;)
|
31
|
+
Ne pas endommager l'HTML et les HTML entities (&nbsp;)
|
33
32
|
|
34
33
|
### v2
|
34
|
+
S'adapter aux locales (détecter I18n)
|
35
35
|
Permettre les configs
|
36
36
|
|
37
37
|
## Sources
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Orthotypo
|
2
|
+
class Analyzer
|
3
|
+
|
4
|
+
def self.url?(string)
|
5
|
+
(string =~ /\A#{URI::DEFAULT_PARSER.regexp[:ABS_URI]}\z/) &&
|
6
|
+
(string =~ /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?\z/ix) ? true : false
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.email?(string)
|
10
|
+
string =~ /\A#{URI::MailTo::EMAIL_REGEXP}\z/ ? true : false
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.precious?(string)
|
14
|
+
email?(string) || url?(string)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|
data/lib/orthotypo/composer.rb
CHANGED
@@ -5,6 +5,7 @@ module Orthotypo
|
|
5
5
|
SPACE = ' '.freeze
|
6
6
|
NBSP = ' '.freeze
|
7
7
|
NNBSP = ' '.freeze
|
8
|
+
PRECIOUS_TOKEN = 'orthotypopreciousthing'
|
8
9
|
|
9
10
|
def initialize(string, html: nil)
|
10
11
|
@string = string
|
@@ -18,6 +19,12 @@ module Orthotypo
|
|
18
19
|
[]
|
19
20
|
end
|
20
21
|
|
22
|
+
def chars_with_space_before_after_digit
|
23
|
+
[
|
24
|
+
'%'
|
25
|
+
]
|
26
|
+
end
|
27
|
+
|
21
28
|
def chars_with_space_after
|
22
29
|
[
|
23
30
|
',',
|
@@ -41,6 +48,22 @@ module Orthotypo
|
|
41
48
|
[]
|
42
49
|
end
|
43
50
|
|
51
|
+
def chars_with_no_space_around_between_digits
|
52
|
+
[
|
53
|
+
'/',
|
54
|
+
':'
|
55
|
+
]
|
56
|
+
end
|
57
|
+
|
58
|
+
def chars_in_numbers
|
59
|
+
[
|
60
|
+
'.',
|
61
|
+
',',
|
62
|
+
'/',
|
63
|
+
':'
|
64
|
+
]
|
65
|
+
end
|
66
|
+
|
44
67
|
def is_html?
|
45
68
|
# TODO contains tags?
|
46
69
|
@html || contains_html_entities?
|
@@ -52,17 +75,21 @@ module Orthotypo
|
|
52
75
|
|
53
76
|
def prepare_ortho
|
54
77
|
@ortho = string.dup
|
55
|
-
@ortho = html_entities.decode(@ortho) if contains_html_entities?
|
78
|
+
# @ortho = html_entities.decode(@ortho) if contains_html_entities?
|
79
|
+
@nokogiri = Nokogiri::HTML.fragment @ortho
|
56
80
|
end
|
57
81
|
|
58
82
|
def clean_ortho
|
59
|
-
@ortho =
|
83
|
+
@ortho = @nokogiri.to_s
|
84
|
+
# @ortho = html_entities.encode(@ortho) if contains_html_entities?
|
60
85
|
end
|
61
86
|
|
62
87
|
def parse
|
63
88
|
prepare_ortho
|
89
|
+
preserve_precious_things
|
64
90
|
# Chars
|
65
91
|
parse_chars_with_space_before
|
92
|
+
parse_chars_with_space_before_after_digit
|
66
93
|
parse_chars_with_space_after
|
67
94
|
parse_chars_with_space_around
|
68
95
|
parse_chars_with_no_space_around
|
@@ -70,17 +97,47 @@ module Orthotypo
|
|
70
97
|
parse_pairs_with_space_around
|
71
98
|
parse_pairs_with_no_space_around
|
72
99
|
# Numbers
|
73
|
-
|
100
|
+
parse_chars_in_numbers
|
74
101
|
#
|
75
102
|
clean_ortho
|
103
|
+
restore_precious_things
|
104
|
+
end
|
105
|
+
|
106
|
+
def preserve_precious_things
|
107
|
+
@precious_things = []
|
108
|
+
@nokogiri.traverse do |node|
|
109
|
+
next unless node.text?
|
110
|
+
new_content = node.content.split(SPACE).map { |fragment|
|
111
|
+
if Analyzer::precious?(fragment)
|
112
|
+
token = "#{PRECIOUS_TOKEN}#{@precious_things.length}"
|
113
|
+
@precious_things << fragment
|
114
|
+
token
|
115
|
+
else
|
116
|
+
fragment
|
117
|
+
end
|
118
|
+
}.join(SPACE)
|
119
|
+
node.content = new_content
|
120
|
+
end
|
121
|
+
end
|
122
|
+
|
123
|
+
def restore_precious_things
|
124
|
+
@precious_things.each_with_index do |value, index|
|
125
|
+
@ortho.gsub! "#{PRECIOUS_TOKEN}#{index}", value
|
126
|
+
end
|
76
127
|
end
|
77
128
|
|
78
129
|
def parse_chars_with_space_before
|
79
130
|
chars_with_space_before.each do |char|
|
80
131
|
# Espace normal avant -> espace fine insécable avant
|
81
|
-
fix(SPACE +
|
132
|
+
fix(SPACE + char, NNBSP + char)
|
82
133
|
# Pas d'espace avant -> espace fine insécable avant
|
83
|
-
fix(/([[:
|
134
|
+
fix(/([[:alpha:]])[#{char}]/, "\\1" + NNBSP + char)
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
def parse_chars_with_space_before_after_digit
|
139
|
+
chars_with_space_before_after_digit.each do |char|
|
140
|
+
fix(/([[:digit:]])[#{char}]/, "\\1" + NNBSP + char)
|
84
141
|
end
|
85
142
|
end
|
86
143
|
|
@@ -89,7 +146,7 @@ module Orthotypo
|
|
89
146
|
# Espace avant -> pas d'espace avant
|
90
147
|
fix(SPACE + char, char)
|
91
148
|
# Pas d'espace après -> espace après
|
92
|
-
fix(/[#{char}]([[:
|
149
|
+
fix(/[#{char}]([[:alpha:]])/, char + SPACE + "\\1")
|
93
150
|
end
|
94
151
|
end
|
95
152
|
|
@@ -98,7 +155,7 @@ module Orthotypo
|
|
98
155
|
# Espace normal avant -> espace fine insécable avant
|
99
156
|
fix(SPACE + char, NNBSP + char)
|
100
157
|
# Pas d'espace avant -> espace fine insécable avant
|
101
|
-
fix(/([[:
|
158
|
+
fix(/([[:alpha:]])[#{char}]/, "\\1" + NNBSP + char)
|
102
159
|
end
|
103
160
|
end
|
104
161
|
|
@@ -134,8 +191,10 @@ module Orthotypo
|
|
134
191
|
end
|
135
192
|
end
|
136
193
|
|
137
|
-
def
|
138
|
-
|
194
|
+
def parse_chars_in_numbers
|
195
|
+
chars_in_numbers.each do |char|
|
196
|
+
fix(/([[:digit:]])[[:space:]][#{char}]([[:digit:]])/, "\\1" + char + "\\2")
|
197
|
+
fix(/([[:digit:]])[[:space:]][#{char}][[:space:]]([[:digit:]])/, "\\1" + char + "\\2")
|
139
198
|
fix(/([[:digit:]])[#{char}][[:space:]]([[:digit:]])/, "\\1" + char + "\\2")
|
140
199
|
end
|
141
200
|
end
|
@@ -145,7 +204,10 @@ module Orthotypo
|
|
145
204
|
end
|
146
205
|
|
147
206
|
def fix(bad, good)
|
148
|
-
@
|
207
|
+
@nokogiri.traverse do |node|
|
208
|
+
next unless node.text?
|
209
|
+
node.content = node.content.gsub(bad, good)
|
210
|
+
end
|
149
211
|
end
|
150
212
|
end
|
151
213
|
end
|
data/lib/orthotypo/version.rb
CHANGED
data/lib/orthotypo.rb
CHANGED
data/orthotypo.gemspec
CHANGED
data/spec/composer/fr_spec.rb
CHANGED
@@ -5,7 +5,8 @@ describe Orthotypo::Composer::Fr do
|
|
5
5
|
it 'adds spaces before double punctuation marks' do
|
6
6
|
expect("mot: suite".ortho).to(eq("mot : suite"))
|
7
7
|
expect("é: suite".ortho).to(eq("é : suite"))
|
8
|
-
|
8
|
+
# Pas automatisable, parce que 11:20
|
9
|
+
# expect("1: suite".ortho).to(eq("1 : suite"))
|
9
10
|
expect("mot; suite".ortho).to(eq("mot ; suite"))
|
10
11
|
expect("mot!".ortho).to(eq("mot !"))
|
11
12
|
expect("mot !".ortho).to(eq("mot !"))
|
@@ -44,6 +45,14 @@ describe Orthotypo::Composer::Fr do
|
|
44
45
|
expect("10 %".ortho).to(eq("10 %"))
|
45
46
|
end
|
46
47
|
|
48
|
+
it 'fixes dates/time' do
|
49
|
+
expect("10/01/2023 16:00".ortho).to(eq("10/01/2023 16:00"))
|
50
|
+
expect("10/01/2023 16:00:00".ortho).to(eq("10/01/2023 16:00:00"))
|
51
|
+
expect("10 / 01 / 2023 16:00".ortho).to(eq("10/01/2023 16:00"))
|
52
|
+
expect("10 / 01 / 2023 16 : 00".ortho).to(eq("10/01/2023 16:00"))
|
53
|
+
expect("10 octobre 2023 16:00".ortho).to(eq("10 octobre 2023 16:00"))
|
54
|
+
end
|
55
|
+
|
47
56
|
# https://www.scribbr.fr/elements-linguistiques/les-espaces/
|
48
57
|
it 'tests de Justine Debret' do
|
49
58
|
expect("Elle a vu son cousin,sa tante et son oncle.Ils allaient tous très bien.".ortho).to(eq("Elle a vu son cousin, sa tante et son oncle. Ils allaient tous très bien."))
|
@@ -53,6 +62,7 @@ describe Orthotypo::Composer::Fr do
|
|
53
62
|
expect("Il a dit : «J’arrive ce matin ( ou plus tard ) à Paris [ rue de la République ] pour son anniversaire.»".ortho).to(eq("Il a dit : « J’arrive ce matin (ou plus tard) à Paris [rue de la République] pour son anniversaire. »"))
|
54
63
|
# Le test suivant n'est pas automatisable, parce qu'on ne peut distinguer un Paris-Brest (le gâteau) d'un Paris - Brest (le trajet)
|
55
64
|
# expect("Nous l’avons rencontré à Saint - Martin.".ortho).to(eq("Nous l’avons rencontré à Saint-Martin."))
|
65
|
+
expect("Il roule pendant 31, 5 km.".ortho).to(eq("Il roule pendant 31,5 km."))
|
56
66
|
# Le test suivant est-il automatisable ?
|
57
67
|
# expect("Il roule pendant 31, 5km.".ortho).to(eq("Il roule pendant 31,5 km."))
|
58
68
|
# Pas automatisable, rien ne permet de distinguer s'il s'agit d'un rang ou d'un nombre
|
data/spec/orthotypo_spec.rb
CHANGED
@@ -8,4 +8,14 @@ describe Orthotypo do
|
|
8
8
|
it 'leaves html tags untouched' do
|
9
9
|
expect("<b>bold</b>".ortho).to eq "<b>bold</b>"
|
10
10
|
end
|
11
|
+
|
12
|
+
it 'leaves URLs untouched' do
|
13
|
+
expect("https://unsplash.com/@lusvardi?utm_source=osuny".ortho).to eq "https://unsplash.com/@lusvardi?utm_source=osuny"
|
14
|
+
expect("<a href=\"https://unsplash.com/@lusvardi?utm_source=osuny\">https://unsplash.com/@lusvardi?utm_source=osuny</a>".ortho).to eq "<a href=\"https://unsplash.com/@lusvardi?utm_source=osuny\">https://unsplash.com/@lusvardi?utm_source=osuny</a>"
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'leaves URLs untouched' do
|
18
|
+
expect("prenom.nom@example.com".ortho).to eq "prenom.nom@example.com"
|
19
|
+
expect("<a href=\"mailto:prenom.nom@example.com\">prenom.nom@example.com</a>".ortho).to eq "<a href=\"mailto:prenom.nom@example.com\">prenom.nom@example.com</a>"
|
20
|
+
end
|
11
21
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: orthotypo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: rspec
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -87,6 +101,7 @@ files:
|
|
87
101
|
- bin/console
|
88
102
|
- bin/setup
|
89
103
|
- lib/orthotypo.rb
|
104
|
+
- lib/orthotypo/analyzer.rb
|
90
105
|
- lib/orthotypo/composer.rb
|
91
106
|
- lib/orthotypo/composer/en.rb
|
92
107
|
- lib/orthotypo/composer/en_gb.rb
|
@@ -98,6 +113,7 @@ files:
|
|
98
113
|
- lib/orthotypo/version.rb
|
99
114
|
- orthotypo.gemspec
|
100
115
|
- sig/orthotypo.rbs
|
116
|
+
- spec/analyzer_spec.rb
|
101
117
|
- spec/composer/en_spec.rb
|
102
118
|
- spec/composer/fr_spec.rb
|
103
119
|
- spec/localizer_spec.rb
|