pragmatic_segmenter_server 0.0.8 → 0.0.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 158a44e00c5a1ab64d3fb77001d48286637ed30cacb0d7098fb0077f311223ed
4
- data.tar.gz: e78e2cc010710a465454c0bc339dd26edcde0f2da6637eccbc270c40fde3f78f
3
+ metadata.gz: 691daea57ce5a11eea96376091386612365d906b6fea70efa0db9b1233a0881f
4
+ data.tar.gz: a8bd66465a670f0b103740e113489fb047879bf26d6b1275c51b2f6efec1f724
5
5
  SHA512:
6
- metadata.gz: 680f9f950efc494262237e9f0a604c01fec37b60272f48aa70cfe8f413475f4d9c4dcc435515431fddb4d756b40069ada85b1910f82c6a4e7b1d700fd132a069
7
- data.tar.gz: 1b2221423b6ece5c40e066f61180502049a645aa3b5148a9fad325cc723e031234635cddd92399bed5da1708e40934596e0ef51ba39a3058cb536b358566a83e
6
+ metadata.gz: f374217088e3f83498d3cff70f655c99947fc66f06ff76f8665d7e2f93e9428e5e78f7556d978f1f0aef99b17ff8dc55172af889c2d95c9f75022886a787634f
7
+ data.tar.gz: c741cce30fc4e4041f722545834a27b29a69d331e6c0c15f66ae328c49bb461a818a9eec82a90bbf27c0822254da72e82edf8d7e2ec4fb48e741ecb0d2e93753
@@ -2,8 +2,10 @@ require 'pragmatic_segmenter'
2
2
 
3
3
 
4
4
  def segment(text, lang, segmentByNewline)
5
-
6
- newLineRegex = /((?: *[\n\r\t]+ *)+)/
5
+
6
+ # Create parts
7
+ newLineRegex = /(?:( *[\n\r\t]+ *)+|( {3,}))/
8
+
7
9
  mask = ''
8
10
  segments = []
9
11
 
@@ -14,20 +16,40 @@ def segment(text, lang, segmentByNewline)
14
16
  end
15
17
 
16
18
  textParts.each do |textPart|
19
+
17
20
  if segmentByNewline && textPart.match(newLineRegex)
18
21
  mask += textPart
22
+ # puts("Sep:" + textPart.inspect)
19
23
  else
24
+ # puts("texto:" + textPart )
20
25
  if textPart == ""
21
26
  next
22
27
  end
28
+
23
29
  ps = PragmaticSegmenter::Segmenter.new(text: textPart, language: lang, clean:false)
30
+
31
+
24
32
  ps.segment.each do |segment|
25
33
  segments.push(segment)
26
- end
27
- mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
28
-
34
+
35
+ end
36
+
37
+ # Create mask
38
+ isCleaned=false # Sometimes pragmatic "cleans" the text
39
+ ps.segment.each do |segment|
40
+ if not textPart.include? segment
41
+ isCleaned=true
42
+ break
43
+ end
44
+ end
45
+ if isCleaned
46
+ placeHolders = []
47
+ ps.segment.length.times { placeHolders << "{}" }
48
+ mask += placeHolders.join(" ")
49
+ else
50
+ mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
51
+ end
29
52
  end
30
-
31
53
  end
32
54
 
33
55
  return segments, mask
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter_server
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Laurent Bié
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-27 00:00:00.000000000 Z
11
+ date: 2023-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sinatra
@@ -93,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  - !ruby/object:Gem::Version
94
94
  version: '0'
95
95
  requirements: []
96
- rubygems_version: 3.1.2
96
+ rubygems_version: 3.3.5
97
97
  signing_key:
98
98
  specification_version: 4
99
99
  summary: A server for pragmatic segmenter