pragmatic_segmenter_server 0.0.7 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 004a705fc7f50fa65c8b5a99af899659baad350ffbe8992d6b929fccf2fc6195
4
- data.tar.gz: a713bffdbed2ae5a80fd7481325d75d9e9e78fe47259268fedd5f8c18ff1e8e5
3
+ metadata.gz: 9d32be051c0f866fa93fb7c5b25aab2949f704ea009fb374d8aa4356c7dbd2a8
4
+ data.tar.gz: b459035e2986c2383962836a705a36e75e933f879f90f45ca9055dce19e6a8af
5
5
  SHA512:
6
- metadata.gz: 24286ee9393591d3c216a12eff44bd68a498db13af431cf187d208efe1491997512b558a265ec2e1a64b3f4eb07b755e90141691d3bf6d89865cae8cfeeea7f9
7
- data.tar.gz: e82df4506646404fcff2d5d637ed4062f94393b8d2b1e68c2b99be304c1a7ac46cc9ac416bb5669bc02adbb443aeb2caae8ad271cf7eb64d67f737063371cf6f
6
+ metadata.gz: 46b4d760bea4ddcf94762aadb0eebcb0c38cefbab26010f74006e812c028715ce54a3193790e2367f2e79ef121fa65a1e5f3f7fa0f5e7f010e6ac9797d2e32aa
7
+ data.tar.gz: 7cf14771e1c810470c9fb82024353cf9e386a4c093ee3473689a812b039ccfd4af458b33f773c29e5a69a9b5046207ab14e6a7437e0d9a8f7a250f4fdbf30aea
@@ -2,8 +2,10 @@ require 'pragmatic_segmenter'
2
2
 
3
3
 
4
4
  def segment(text, lang, segmentByNewline)
5
-
6
- newLineRegex = /((?: *[\n\r\t]+ *)+)/
5
+
6
+ # Create parts
7
+ newLineRegex = /(?:( *[\n\r\t]+ *)+|( {3,}))/
8
+
7
9
  mask = ''
8
10
  segments = []
9
11
 
@@ -14,20 +16,40 @@ def segment(text, lang, segmentByNewline)
14
16
  end
15
17
 
16
18
  textParts.each do |textPart|
19
+
17
20
  if segmentByNewline && textPart.match(newLineRegex)
18
21
  mask += textPart
22
+ puts("Sep:" + textPart.inspect)
19
23
  else
24
+ puts("texto:" + textPart )
20
25
  if textPart == ""
21
26
  next
22
27
  end
28
+
23
29
  ps = PragmaticSegmenter::Segmenter.new(text: textPart, language: lang, clean:false)
30
+
31
+
24
32
  ps.segment.each do |segment|
25
33
  segments.push(segment)
26
- end
27
- mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
28
-
34
+
35
+ end
36
+
37
+ # Created mask
38
+ isCleaned=false # Sometimes pragmatic "cleans" the text
39
+ ps.segment.each do |segment|
40
+ if not textPart.include? segment
41
+ isCleaned=true
42
+ break
43
+ end
44
+ end
45
+ if isCleaned
46
+ placeHolders = []
47
+ ps.segment.length.times { placeHolders << "{}" }
48
+ mask += placeHolders.join(" ")
49
+ else
50
+ mask += textPart.gsub(Regexp.new(ps.segment.map { |string| Regexp.escape(string) }.join("|")), "{}")
51
+ end
29
52
  end
30
-
31
53
  end
32
54
 
33
55
  return segments, mask
@@ -65,4 +65,5 @@ post '/segment' do
65
65
  end
66
66
 
67
67
  get '/healthcheck' do
68
- 'ok'
68
+ 'ok'
69
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_segmenter_server
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Laurent Bié
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-27 00:00:00.000000000 Z
11
+ date: 2023-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sinatra
@@ -93,7 +93,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  - !ruby/object:Gem::Version
94
94
  version: '0'
95
95
  requirements: []
96
- rubygems_version: 3.1.2
96
+ rubygems_version: 3.3.5
97
97
  signing_key:
98
98
  specification_version: 4
99
99
  summary: A server for pragmatic segmenter