pragmatic_segmenter_server 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 158a44e00c5a1ab64d3fb77001d48286637ed30cacb0d7098fb0077f311223ed
4
- data.tar.gz: e78e2cc010710a465454c0bc339dd26edcde0f2da6637eccbc270c40fde3f78f
3
+ metadata.gz: 9d32be051c0f866fa93fb7c5b25aab2949f704ea009fb374d8aa4356c7dbd2a8
4
+ data.tar.gz: b459035e2986c2383962836a705a36e75e933f879f90f45ca9055dce19e6a8af
5
5
  SHA512:
6
- metadata.gz: 680f9f950efc494262237e9f0a604c01fec37b60272f48aa70cfe8f413475f4d9c4dcc435515431fddb4d756b40069ada85b1910f82c6a4e7b1d700fd132a069
7
- data.tar.gz: 1b2221423b6ece5c40e066f61180502049a645aa3b5148a9fad325cc723e031234635cddd92399bed5da1708e40934596e0ef51ba39a3058cb536b358566a83e
6
+ metadata.gz: 46b4d760bea4ddcf94762aadb0eebcb0c38cefbab26010f74006e812c028715ce54a3193790e2367f2e79ef121fa65a1e5f3f7fa0f5e7f010e6ac9797d2e32aa
7
+ data.tar.gz: 7cf14771e1c810470c9fb82024353cf9e386a4c093ee3473689a812b039ccfd4af458b33f773c29e5a69a9b5046207ab14e6a7437e0d9a8f7a250f4fdbf30aea
@@ -2,8 +2,10 @@ require 'pragmatic_segmenter'
2
2
 
3
3
 
4
4
  def segment(text, lang, segmentByNewline)
5
-
6
- newLineRegex = /((?: *[\n\r\t]+ *)+)/
5
+
6
+ # Create parts
7
+ newLineRegex = /(?:( *[\n\r\t]+ *)+|( {3,}))/
8
+
7
9
  mask = ''
8
10
  segments = []
9
11
 
@@ -14,20 +16,40 @@ def segment(text, lang, segmentByNewline)
14
16
  end
15
17
 
16
18
  textParts.each do |textPart|
19
+
17
20
  if segmentByNewline && textPart.match(newLineRegex)
18
21
  mask += textPart
22
+ puts("Sep:" + textPart.inspect)
19
23
  else
24
+ puts("texto:" + textPart )
20
25
  if textPart == ""
21
26
  next
22
27
  end
28
+
23
29
  ps = PragmaticSegmenter::Segmenter.new(text: textPart, language: lang, clean:false)
30
+
31
+
24
32
  ps.segment.each do |segment|
25
33
  segments.push(segment)
26
- end
27
- mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
28
-
34
+
35
+ end
36
+
37
+ # Created mask
38
+ isCleaned=false # Sometimes pragmatic "cleans" the text
39
+ ps.segment.each do |segment|
40
+ if not textPart.include? segment
41
+ isCleaned=true
42
+ break
43
+ end
44
+ end
45
+ if isCleaned
46
+ placeHolders = []
47
+ ps.segment.length.times { placeHolders << "{}" }
48
+ mask += placeHolders.join(" ")
49
+ else
50
+ mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
51
+ end
29
52
  end
30
-
31
53
  end
32
54
 
33
55
  return segments, mask
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter_server
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Laurent Bié
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-27 00:00:00.000000000 Z
11
+ date: 2023-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sinatra
@@ -93,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  - !ruby/object:Gem::Version
94
94
  version: '0'
95
95
  requirements: []
96
- rubygems_version: 3.1.2
96
+ rubygems_version: 3.3.5
97
97
  signing_key:
98
98
  specification_version: 4
99
99
  summary: A server for pragmatic segmenter