pragmatic_segmenter_server 0.0.7 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_segmenter_server/segment.rb +28 -6
- data/lib/pragmatic_segmenter_server.rb +2 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d32be051c0f866fa93fb7c5b25aab2949f704ea009fb374d8aa4356c7dbd2a8
|
4
|
+
data.tar.gz: b459035e2986c2383962836a705a36e75e933f879f90f45ca9055dce19e6a8af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 46b4d760bea4ddcf94762aadb0eebcb0c38cefbab26010f74006e812c028715ce54a3193790e2367f2e79ef121fa65a1e5f3f7fa0f5e7f010e6ac9797d2e32aa
|
7
|
+
data.tar.gz: 7cf14771e1c810470c9fb82024353cf9e386a4c093ee3473689a812b039ccfd4af458b33f773c29e5a69a9b5046207ab14e6a7437e0d9a8f7a250f4fdbf30aea
|
data/lib/pragmatic_segmenter_server/segment.rb
CHANGED
@@ -2,8 +2,10 @@ require 'pragmatic_segmenter'
|
|
2
2
|
|
3
3
|
|
4
4
|
def segment(text, lang, segmentByNewline)
|
5
|
-
|
6
|
-
|
5
|
+
|
6
|
+
# Create parts
|
7
|
+
newLineRegex = /(?:( *[\n\r\t]+ *)+|( {3,}))/
|
8
|
+
|
7
9
|
mask = ''
|
8
10
|
segments = []
|
9
11
|
|
@@ -14,20 +16,40 @@ def segment(text, lang, segmentByNewline)
|
|
14
16
|
end
|
15
17
|
|
16
18
|
textParts.each do |textPart|
|
19
|
+
|
17
20
|
if segmentByNewline && textPart.match(newLineRegex)
|
18
21
|
mask += textPart
|
22
|
+
puts("Sep:" + textPart.inspect)
|
19
23
|
else
|
24
|
+
puts("texto:" + textPart )
|
20
25
|
if textPart == ""
|
21
26
|
next
|
22
27
|
end
|
28
|
+
|
23
29
|
ps = PragmaticSegmenter::Segmenter.new(text: textPart, language: lang, clean:false)
|
30
|
+
|
31
|
+
|
24
32
|
ps.segment.each do |segment|
|
25
33
|
segments.push(segment)
|
26
|
-
|
27
|
-
|
28
|
-
|
34
|
+
|
35
|
+
end
|
36
|
+
|
37
|
+
# Created mask
|
38
|
+
isCleaned=false # Sometimes pragmatic "cleans" the text
|
39
|
+
ps.segment.each do |segment|
|
40
|
+
if not textPart.include? segment
|
41
|
+
isCleaned=true
|
42
|
+
break
|
43
|
+
end
|
44
|
+
end
|
45
|
+
if isCleaned
|
46
|
+
placeHolders = []
|
47
|
+
ps.segment.length.times { placeHolders << "{}" }
|
48
|
+
mask += placeHolders.join(" ")
|
49
|
+
else
|
50
|
+
mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
|
51
|
+
end
|
29
52
|
end
|
30
|
-
|
31
53
|
end
|
32
54
|
|
33
55
|
return segments, mask
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_segmenter_server
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.7
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Laurent Bié
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-05-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sinatra
|
@@ -93,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
93
|
- !ruby/object:Gem::Version
|
94
94
|
version: '0'
|
95
95
|
requirements: []
|
96
|
-
rubygems_version: 3.
|
96
|
+
rubygems_version: 3.3.5
|
97
97
|
signing_key:
|
98
98
|
specification_version: 4
|
99
99
|
summary: A server for pragmatic segmenter
|