pragmatic_segmenter_server 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 54321133f0512e94bc8df32b823780820dd57569
4
+ data.tar.gz: 595f906c6115a8cded9521529a66b4d2cca7c280
5
+ SHA512:
6
+ metadata.gz: f92cf7292c893d7cb3ddc053a176007e9fe72022b7b6557e841e853032ccec9a5c7a8a495d2c714762c96617838d6a51e9099235af3c37fbaef651f8b7dc94d8
7
+ data.tar.gz: 4fb69b43f5f9af07e611038437e877706aece47b8c155c8dc4dd20e7463895b41812c574732199372d7bc19bcf19f9adaf2eaf0b7876ab3146118db12552cc07
@@ -0,0 +1,28 @@
1
+ require 'pragmatic_segmenter'
2
+
3
+
4
# Splits +text+ into sentences and builds a "mask" of the text in which every
# sentence that was found is replaced by the placeholder "{}".
#
# @param text [String] the text to segment
# @param lang [String] language code handed to PragmaticSegmenter::Segmenter
# @param segmentByNewline [Boolean] when truthy, the text is first split on
#   runs of newlines/carriage returns/tabs (with their surrounding spaces);
#   those separators are copied verbatim into the mask and each remaining
#   part is segmented on its own
# @return [Array(Array<String>, String)] the sentences in order of
#   appearance, and the mask
def segment(text, lang, segmentByNewline)
  newline_regex = /((?: *[\n\r\t]+ *)+)/
  segments = []
  mask_parts = []

  # The capture group makes String#split keep the separators in its result,
  # so they can be re-inserted untouched into the mask.
  text_parts = segmentByNewline ? text.split(newline_regex) : [text]

  text_parts.each do |text_part|
    if segmentByNewline && text_part =~ newline_regex
      mask_parts << text_part
      next
    end

    # Segment once and reuse the result for both the sentence list and the mask.
    sentences = PragmaticSegmenter::Segmenter.new(text: text_part, language: lang, clean: false).segment
    segments.concat(sentences)
    mask_parts << mask_sentences(text_part, sentences)
  end

  [segments, mask_parts.join]
end

# Replaces each sentence, in order of appearance, with "{}" inside +text_part+.
#
# Sentences are located left to right starting after the previous match, so a
# sentence that is a prefix of a later one cannot steal its placeholder, and an
# empty sentence list leaves the text unchanged. Sentences that cannot be found
# verbatim are skipped and their text stays in the mask.
def mask_sentences(text_part, sentences)
  pieces = []
  cursor = 0

  sentences.each do |sentence|
    next if sentence.empty?

    start = text_part.index(sentence, cursor)
    next if start.nil?

    pieces << text_part[cursor...start] << '{}'
    cursor = start + sentence.length
  end

  pieces << text_part[cursor..-1]
  pieces.join
end
@@ -0,0 +1,65 @@
1
+ require 'sinatra'
2
+ require 'json'
3
+ require 'json-schema'
4
+ require_relative 'segment'
5
+
6
+
7
# JSON schema of the request body accepted by POST /segment.
# "minItems" sits on the "texts" array itself so that an empty list is
# rejected; nested under "items" it would have no effect on string items.
SEGMENT_REQUEST_SCHEMA = {
  "type" => "object",
  "required" => ["texts", "lang"],
  "properties" => {
    "texts" => {
      "type" => "array",
      "minItems" => 1,
      "items" => { "type" => "string" }
    },
    "lang" => { "type" => "string" },
    "segmentByNewline" => { "type" => "boolean" }
  }
}.freeze

# POST /segment
#
# Request body (JSON object):
#   "texts"            - non-empty array of strings to segment (required)
#   "lang"             - language code passed on to `segment` (required)
#   "segmentByNewline" - boolean, defaults to true when the key is absent
#
# Responds with a JSON array holding one object per input text with the keys
# "text", "segments" and "mask". A body that is not valid JSON or does not
# satisfy the schema yields a 400 response of the form {"error": "<message>"}.
post '/segment' do
  content_type :json

  # Input: malformed JSON and schema violations are both client errors.
  begin
    input = JSON.parse(request.body.read)
    JSON::Validator.validate!(SEGMENT_REQUEST_SCHEMA, input)
  rescue JSON::ParserError, JSON::Schema::ValidationError => e
    halt 400, { "error" => e.message }.to_json
  end

  # Extract info
  lang = input["lang"]
  segment_by_newline = input.fetch("segmentByNewline", true)

  # Segmentation
  results = input["texts"].map do |text|
    segments, mask = segment(text, lang, segment_by_newline)
    { "text" => text, "segments" => segments, "mask" => mask }
  end

  # The block's return value becomes the response body; printing it with
  # `puts` would only write to stdout and leave the response empty.
  results.to_json
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: pragmatic_segmenter_server
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Laurent Bié
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-01-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sinatra
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 2.0.3
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 2.0.3
27
+ - !ruby/object:Gem::Dependency
28
+ name: sinatra-contrib
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 2.0.3
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 2.0.3
41
+ - !ruby/object:Gem::Dependency
42
+ name: json-schema
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '='
46
+ - !ruby/object:Gem::Version
47
+ version: 2.8.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '='
53
+ - !ruby/object:Gem::Version
54
+ version: 2.8.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: pragmatic_segmenter
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.3.18
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.3.18
69
+ description: A HTTP server for pragmatic segmenter
70
+ email: l.bie@pangeanic.com
71
+ executables: []
72
+ extensions: []
73
+ extra_rdoc_files: []
74
+ files:
75
+ - lib/pragmatic_segmenter_server.rb
76
+ - lib/pragmatic_segmenter_server/segment.rb
77
+ homepage: ''
78
+ licenses:
79
+ - MIT
80
+ metadata: {}
81
+ post_install_message:
82
+ rdoc_options: []
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ required_rubygems_version: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0'
95
+ requirements: []
96
+ rubyforge_project:
97
+ rubygems_version: 2.5.2.1
98
+ signing_key:
99
+ specification_version: 4
100
+ summary: A server for pragmatic segmenter
101
+ test_files: []