pragmatic_segmenter_server 0.0.8 → 0.0.9

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 158a44e00c5a1ab64d3fb77001d48286637ed30cacb0d7098fb0077f311223ed
4
- data.tar.gz: e78e2cc010710a465454c0bc339dd26edcde0f2da6637eccbc270c40fde3f78f
3
+ metadata.gz: 9d32be051c0f866fa93fb7c5b25aab2949f704ea009fb374d8aa4356c7dbd2a8
4
+ data.tar.gz: b459035e2986c2383962836a705a36e75e933f879f90f45ca9055dce19e6a8af
5
5
  SHA512:
6
- metadata.gz: 680f9f950efc494262237e9f0a604c01fec37b60272f48aa70cfe8f413475f4d9c4dcc435515431fddb4d756b40069ada85b1910f82c6a4e7b1d700fd132a069
7
- data.tar.gz: 1b2221423b6ece5c40e066f61180502049a645aa3b5148a9fad325cc723e031234635cddd92399bed5da1708e40934596e0ef51ba39a3058cb536b358566a83e
6
+ metadata.gz: 46b4d760bea4ddcf94762aadb0eebcb0c38cefbab26010f74006e812c028715ce54a3193790e2367f2e79ef121fa65a1e5f3f7fa0f5e7f010e6ac9797d2e32aa
7
+ data.tar.gz: 7cf14771e1c810470c9fb82024353cf9e386a4c093ee3473689a812b039ccfd4af458b33f773c29e5a69a9b5046207ab14e6a7437e0d9a8f7a250f4fdbf30aea
@@ -2,8 +2,10 @@ require 'pragmatic_segmenter'
2
2
 
3
3
 
4
4
  def segment(text, lang, segmentByNewline)
5
-
6
- newLineRegex = /((?: *[\n\r\t]+ *)+)/
5
+
6
+ # Create parts
7
+ newLineRegex = /(?:( *[\n\r\t]+ *)+|( {3,}))/
8
+
7
9
  mask = ''
8
10
  segments = []
9
11
 
@@ -14,20 +16,40 @@ def segment(text, lang, segmentByNewline)
14
16
  end
15
17
 
16
18
  textParts.each do |textPart|
19
+
17
20
  if segmentByNewline && textPart.match(newLineRegex)
18
21
  mask += textPart
22
+ puts("Sep:" + textPart.inspect)
19
23
  else
24
+ puts("texto:" + textPart )
20
25
  if textPart == ""
21
26
  next
22
27
  end
28
+
23
29
  ps = PragmaticSegmenter::Segmenter.new(text: textPart, language: lang, clean:false)
30
+
31
+
24
32
  ps.segment.each do |segment|
25
33
  segments.push(segment)
26
- end
27
- mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
28
-
34
+
35
+ end
36
+
37
+ # Created mask
38
+ isCleaned=false # Sometimes pragmatic "cleans" the text
39
+ ps.segment.each do |segment|
40
+ if not textPart.include? segment
41
+ isCleaned=true
42
+ break
43
+ end
44
+ end
45
+ if isCleaned
46
+ placeHolders = []
47
+ ps.segment.length.times { placeHolders << "{}" }
48
+ mask += placeHolders.join(" ")
49
+ else
50
+ mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
51
+ end
29
52
  end
30
-
31
53
  end
32
54
 
33
55
  return segments, mask
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter_server
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Laurent Bié
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-27 00:00:00.000000000 Z
11
+ date: 2023-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sinatra
@@ -93,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  - !ruby/object:Gem::Version
94
94
  version: '0'
95
95
  requirements: []
96
- rubygems_version: 3.1.2
96
+ rubygems_version: 3.3.5
97
97
  signing_key:
98
98
  specification_version: 4
99
99
  summary: A server for pragmatic segmenter