chunkify 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -1
- data/lib/chunkify/version.rb +1 -1
- data/lib/chunkify.rb +44 -24
- metadata +15 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bd15841c4753a975e1259857c96c524ec5e0c435d9adca98edf59a93cb9ca045
+  data.tar.gz: 3f06158f83273718b61c4a87809b53490acd209ff9e2c0e12593526cde195f1c
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e78c1a25870032ee80ba74d7e218743ccdbde6af6e64e5dcc4c12befceaa1f8d997adf77fa8be9529c6c885988a07f1504a5f38b9a8d45fde2611b7990f0c251
+  data.tar.gz: 4b778ecd6c56a9b413ef813f343bbba8dc18d9e051d62229339b7c10f5903e1b27ead3bb73ad7e3798b6125590503e56f3203aa4d62a9e42420dfdcfa01eabdd
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    chunkify (0.1.
+    chunkify (0.1.4)
       pry

 GEM
@@ -13,6 +13,7 @@ GEM
       coderay (~> 1.1)
       method_source (~> 1.0)
     rake (13.3.1)
+    tokenizer (0.3.0)
     yard (0.9.38)

 PLATFORMS
@@ -21,6 +22,7 @@ PLATFORMS
 DEPENDENCIES
   chunkify!
   rake (~> 13.0)
+  tokenizer
   yard (~> 0.9.38)

 BUNDLED WITH
data/lib/chunkify/version.rb
CHANGED
data/lib/chunkify.rb
CHANGED
@@ -1,39 +1,59 @@
 # frozen_string_literal: true

+require 'tokenizer'
 require_relative "chunkify/version"

 module Chunkify
   class Error < StandardError; end
-
-
-
-  def self.size! input
-    @@SIZE = 2 ** input.split(" ").length
-  end
-
-  def self.size= s
-    @@SIZE = s
-  end
-
-  def self.size
-    @@SIZE
+  def self.tokens text
+    Tokenizer::WhitespaceTokenizer.new(:en).tokenize(text)
   end
-
-  def self.split text, &b
-    if !block_given?
-      b = lambda { |e| e }
-    end
+  def self.split text, size: 1024
     chunk = []
-    paragraphs = text.gsub(/\r/, "").gsub(/\n\n+/,"\n\n").split(/\n\n+/)
+    paragraphs = text.gsub(/\r/, "").gsub(/\n\n+/,"\n\n").split(/\n\n+/).map { |e| e.split("\n").map { |ee| ee.strip.gsub(/ +/, " ") }.join("\n") }
     current_chunk = ""
     paragraphs.each do |para|
-      if
-
-        current_chunk = para.split("\n").map { |e| e.strip.gsub(/ +/, " ") }.join("\n")
+      if /Project Gutenberg/.match(para) || /gutenberg.org/.match(para)
+        next
       else
-
+        if current_chunk.length + para.length > size && !current_chunk.empty?
+          # premptive newline
+          chunk << current_chunk
+          # set next chunk
+          current_chunk = para
+        else
+          # add normally
+          current_chunk += (current_chunk.empty? ? "" : "\n\n") + para
+        end
      end
    end
-    chunk <<
+    chunk << current_chunk unless current_chunk.empty?
+  end
+  @@DOC = Hash.new { |h,k| h[k] = Doc.new(k) }
+  class Doc
+    attr_accessor :doc
+    def initialize k
+      @id = k
+    end
+    # chunk document (... and handle)
+    def chunk &b
+      if block_given?
+        Chunkify.split(@doc).map { |e| b.call(e) }
+      else
+        Chunkify.split(@doc)
+      end
+    end
+    # chunk document to tokens (... and handle)
+    def tokens &b
+      if block_given?
+        chunk { |e| b.call(Chunkify.tokens(e)) }
+      else
+        chunk.map { |e| Chunkify.tokens(e) }
+      end
+    end
+  end
+  # document
+  def self.[] k
+    @@DOC[k]
   end
 end
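The rewritten data/lib/chunkify.rb above replaces the old @@SIZE accessors with a keyword-sized splitter, a tokenizer-backed Chunkify.tokens helper, and a memoized Doc registry reachable through Chunkify.[]. A minimal usage sketch follows; the file name book.txt, the :my_doc key, and the some_long_text variable are illustrative assumptions, while the method calls themselves come from the diff.

require 'chunkify'

some_long_text = File.read('book.txt')   # hypothetical multi-paragraph input

# Split into roughly 1024-character chunks at paragraph boundaries; paragraphs
# mentioning Project Gutenberg are skipped by the new filter.
chunks = Chunkify.split(some_long_text)

# Whitespace-tokenize a single chunk via the tokenizer gem.
tokens = Chunkify.tokens(chunks.first)

# Or go through the Doc registry: Chunkify[:my_doc] builds and caches a Doc
# keyed by :my_doc; its text is assigned through the :doc accessor.
doc = Chunkify[:my_doc]
doc.doc = some_long_text
doc.chunk { |c| puts c.length }          # yields each chunk to the block
doc.tokens.each { |t| p t.first(5) }     # one token array per chunk

Note that split's return value is its final guarded push (chunk << current_chunk unless current_chunk.empty?), so it returns nil rather than an empty array when every paragraph is filtered out.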
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: chunkify
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.5
 platform: ruby
 authors:
 - Erik Olson
@@ -38,6 +38,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: tokenizer
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description:
 email:
 - xorgnak@xorgnak.com
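The tokenizer dependency added above is typed :development with an unconstrained ">= 0" requirement. The gemspec itself is not part of this diff, but a metadata block of this shape typically comes from a declaration like the hypothetical line below.

# chunkify.gemspec (hypothetical excerpt): an unconstrained development
# dependency serializes to the ">= 0" / type: :development entry shown above.
spec.add_development_dependency "tokenizer"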