act_as_page_extractor 0.6.0 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +12 -15
- data/README.md +80 -71
- data/act_as_page_extractor.gemspec +6 -6
- data/docs/publishing.md +14 -0
- data/lib/act_as_page_extractor/modules/tools.rb +4 -4
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/spec/spec_helper.rb +6 -2
- metadata +21 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
|
4
|
+
data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
|
7
|
+
data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
act_as_page_extractor (0.
|
5
|
-
activerecord (~> 6
|
4
|
+
act_as_page_extractor (0.6.1)
|
5
|
+
activerecord (~> 6)
|
6
6
|
awesome_print (~> 1)
|
7
7
|
docsplit (~> 0)
|
8
8
|
filesize (~> 0)
|
9
|
-
pdf-reader (~> 1
|
9
|
+
pdf-reader (~> 1, >= 1.4)
|
10
10
|
pdf_utils (~> 0)
|
11
|
-
prawn (~>
|
11
|
+
prawn (~> 1)
|
12
12
|
total_compressor (~> 0)
|
13
13
|
|
14
14
|
GEM
|
@@ -38,6 +38,7 @@ GEM
|
|
38
38
|
i18n (1.14.5)
|
39
39
|
concurrent-ruby (~> 1.0)
|
40
40
|
minitest (5.25.1)
|
41
|
+
pdf-core (0.4.0)
|
41
42
|
pdf-reader (1.4.1)
|
42
43
|
Ascii85 (~> 1.0.0)
|
43
44
|
afm (~> 0.2.1)
|
@@ -45,13 +46,9 @@ GEM
|
|
45
46
|
ruby-rc4
|
46
47
|
ttfunk
|
47
48
|
pdf_utils (0.1.0)
|
48
|
-
prawn (
|
49
|
-
|
50
|
-
|
51
|
-
prawn-security (>= 0.7.1, < 0.8)
|
52
|
-
prawn-core (0.7.2)
|
53
|
-
prawn-layout (0.7.2)
|
54
|
-
prawn-security (0.7.1)
|
49
|
+
prawn (1.3.0)
|
50
|
+
pdf-core (~> 0.4.0)
|
51
|
+
ttfunk (~> 1.4.0)
|
55
52
|
rake (12.3.3)
|
56
53
|
rspec (3.13.0)
|
57
54
|
rspec-core (~> 3.13.0)
|
@@ -77,7 +74,7 @@ GEM
|
|
77
74
|
total_compressor (0.1.11)
|
78
75
|
awesome_print (~> 1.1, >= 1.1.0)
|
79
76
|
rubyzip (~> 1.2, >= 1.2.2)
|
80
|
-
ttfunk (1.
|
77
|
+
ttfunk (1.4.0)
|
81
78
|
tzinfo (2.0.6)
|
82
79
|
concurrent-ruby (~> 1.0)
|
83
80
|
zeitwerk (2.6.17)
|
@@ -87,16 +84,16 @@ PLATFORMS
|
|
87
84
|
|
88
85
|
DEPENDENCIES
|
89
86
|
act_as_page_extractor!
|
90
|
-
activerecord (~> 6
|
87
|
+
activerecord (~> 6)
|
91
88
|
awesome_print
|
92
|
-
bundler (~> 1
|
89
|
+
bundler (~> 1)
|
93
90
|
byebug
|
94
91
|
docsplit
|
95
92
|
filesize
|
96
93
|
pdf-reader
|
97
94
|
pdf_utils
|
98
95
|
prawn
|
99
|
-
rake (~> 12
|
96
|
+
rake (~> 12, >= 12.3.3)
|
100
97
|
rspec
|
101
98
|
simplecov
|
102
99
|
total_compressor
|
data/README.md
CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
|
|
7
7
|
|
8
8
|
Install appropriate tools before using:
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
```sh
|
11
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
12
|
+
```
|
12
13
|
Add this line to your application's Gemfile:
|
13
14
|
|
14
|
-
|
15
|
-
|
15
|
+
```rb
|
16
|
+
gem 'act_as_page_extractor'
|
17
|
+
bundle
|
18
|
+
```
|
16
19
|
## Usage
|
17
20
|
|
18
|
-
For example, for model Document we need
|
21
|
+
For example, for the Document model in the Rails framework we need to run:
|
19
22
|
|
20
|
-
|
21
|
-
|
23
|
+
```sh
|
24
|
+
rails g act_as_page_extractor:migration Document category_id user_id
|
25
|
+
```
|
22
26
|
|
23
27
|
As a result we get two migration files:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
add_index :extracted_pages, :document_id
|
47
|
-
add_index :extracted_pages, :category_id
|
48
|
-
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
-
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
-
end
|
28
|
+
```rb
|
29
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
30
|
+
def change
|
31
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
32
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
39
|
+
def change
|
40
|
+
create_table :extracted_pages do |t|
|
41
|
+
t.text :page
|
42
|
+
t.integer :document_id
|
43
|
+
t.integer :category_id
|
44
|
+
t.integer :user_id
|
45
|
+
t.integer :page_number
|
46
|
+
|
47
|
+
t.timestamps null: false
|
51
48
|
end
|
52
49
|
|
50
|
+
add_index :extracted_pages, :document_id
|
51
|
+
add_index :extracted_pages, :category_id
|
52
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
53
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
|
-
Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://
|
58
|
+
The Document model must have a field which contains the path to the file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, txt, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
|
55
59
|
|
56
60
|
Add the following initialization parameters to the model:
|
57
61
|
|
58
|
-
|
59
|
-
|
62
|
+
```rb
|
63
|
+
class Document < ActiveRecord::Base
|
64
|
+
include ActAsPageExtractor
|
60
65
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
act_as_page_extractor options: {
|
67
|
+
document_class: 'Document',
|
68
|
+
save_as_pdf: true,
|
69
|
+
filename: :filename,
|
70
|
+
document_id: :document_id,
|
71
|
+
additional_fields: [:category_id, :user_id],
|
72
|
+
#file_storage: "/full/path/to/documents/storage",
|
73
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
74
|
+
}
|
70
75
|
|
71
|
-
|
72
|
-
|
76
|
+
has_many :extracted_pages, dependent: :destroy
|
77
|
+
end
|
78
|
+
```
|
73
79
|
|
74
80
|
Now our instance has a few new methods:
|
75
81
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
82
|
+
```rb
|
83
|
+
document = Document.first
|
84
|
+
document.page_extract!
|
85
|
+
document.extracted_pages
|
86
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
87
|
|
81
|
-
|
82
|
-
|
88
|
+
# Access to pages
|
89
|
+
ExtractedPage.count
|
83
90
|
|
84
|
-
|
85
|
-
|
91
|
+
# Importing whole directory of documents
|
92
|
+
ActAsPageExtractor.import_files('/path/to/foler/with/documents')
|
86
93
|
|
87
|
-
|
88
|
-
|
94
|
+
# We can use cron to run the processing of all the new documents
|
95
|
+
ActAsPageExtractor.start_extraction
|
89
96
|
|
90
|
-
|
91
|
-
|
97
|
+
# Getting statistics information of all documents
|
98
|
+
ActAsPageExtractor.statistics
|
99
|
+
```
|
92
100
|
|
93
|
-
Parameters of initializing
|
101
|
+
Parameters of initializing **act_as_page_extractor**:
|
94
102
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
103
|
+
* **document_class** - name of model (e.g. Document)
|
104
|
+
* **save_as_pdf** - boolean [true, false], set to true when we want to save the temporary pdf
|
105
|
+
* **filename** - name of the field that gives access to the file; it should be an object with a 'url' method that returns the path to the file (e.g. a CarrierWave object with 'filename.url')
|
106
|
+
* **document_id** - name for saving id
|
107
|
+
* **additional_fields** - additional fields that are added to each extracted page (e.g. for indexing, etc.)
|
108
|
+
* **file_storage** - path for saving tmp files (by default it is "public")
|
109
|
+
* **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
110
|
|
103
111
|
## Run tests
|
104
|
-
|
105
|
-
|
112
|
+
```sh
|
113
|
+
rspec
|
114
|
+
```
|
106
115
|
## Contributing
|
107
116
|
1. Fork it
|
108
117
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
|
|
115
124
|
phlowerteam@gmail.com
|
116
125
|
|
117
126
|
## License
|
118
|
-
Copyright (c)
|
127
|
+
Copyright (c) 2024 PhlowerTeam
|
119
128
|
MIT License
|
@@ -10,25 +10,25 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['phlowerteam@gmail.com']
|
11
11
|
spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
|
12
12
|
spec.summary = %q{Uses system calls}
|
13
|
-
spec.homepage = 'https://github.com/phlowerteam'
|
13
|
+
spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ['lib']
|
19
19
|
|
20
|
-
spec.add_development_dependency 'bundler', '~> 1
|
21
|
-
spec.add_development_dependency 'rake', '~> 12
|
20
|
+
spec.add_development_dependency 'bundler', '~> 1'
|
21
|
+
spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
|
22
22
|
spec.add_development_dependency 'byebug', '~> 0'
|
23
23
|
spec.add_development_dependency 'rspec', '~> 0'
|
24
24
|
spec.add_development_dependency 'simplecov', '~> 0'
|
25
25
|
|
26
|
-
spec.add_runtime_dependency 'activerecord', '~> 6
|
26
|
+
spec.add_runtime_dependency 'activerecord', '~> 6'
|
27
27
|
spec.add_runtime_dependency 'awesome_print', '~> 1'
|
28
28
|
spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
|
29
29
|
spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
|
30
|
-
spec.add_runtime_dependency 'prawn', '~>
|
31
|
-
spec.add_runtime_dependency 'pdf-reader', '~> 1
|
30
|
+
spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
|
31
|
+
spec.add_runtime_dependency 'pdf-reader', '~> 1', '>= 1.4' # need for pdf_utils
|
32
32
|
spec.add_runtime_dependency 'total_compressor', '~> 0' # decompressing files
|
33
33
|
spec.add_runtime_dependency 'filesize', '~> 0' # pretty size of file
|
34
34
|
end
|
data/docs/publishing.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# GEM publishing
|
2
|
+
|
3
|
+
```sh
|
4
|
+
# Add features or fix bugs
|
5
|
+
# Increase version number x.y.z
|
6
|
+
# lib/act_as_page_extractor/version.rb
|
7
|
+
bundle update
|
8
|
+
rspec
|
9
|
+
# git commit & git push
|
10
|
+
|
11
|
+
gem build act_as_page_extractor.gemspec
|
12
|
+
gem install ./act_as_page_extractor-x.y.z.gem
|
13
|
+
gem push act_as_page_extractor-x.y.z.gem
|
14
|
+
```
|
@@ -1,17 +1,17 @@
|
|
1
1
|
require 'timeout'
|
2
2
|
|
3
3
|
module ActAsPageExtractor
|
4
|
+
# :nocov:
|
4
5
|
def timeout_wrapper
|
5
6
|
result = nil
|
6
7
|
begin
|
7
8
|
result = Timeout::timeout(60*5) { yield }
|
8
9
|
rescue
|
9
|
-
# :nocov:
|
10
10
|
ensure
|
11
|
-
# :nocov:
|
12
11
|
result
|
13
12
|
end
|
14
13
|
end
|
14
|
+
# :nocov:
|
15
15
|
|
16
16
|
def is_extracted
|
17
17
|
@pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
|
@@ -46,11 +46,11 @@ module ActAsPageExtractor
|
|
46
46
|
# ap "@copy_document_path"
|
47
47
|
# ap @copy_document_path
|
48
48
|
# ap "@document_path"
|
49
|
-
|
49
|
+
# ap @document_path
|
50
50
|
# ap "@pdf_path"
|
51
51
|
# ap @pdf_path
|
52
52
|
# ap "@pdf_pages"
|
53
|
-
|
53
|
+
# ap @pdf_pages
|
54
54
|
end
|
55
55
|
# :nocov:
|
56
56
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
|
1
|
+
unless ENV['SKIP_COVERAGE']
|
2
2
|
require 'simplecov'
|
3
|
-
SimpleCov.start 'rails'
|
3
|
+
SimpleCov.start 'rails' do
|
4
|
+
add_filter 'vendor'
|
5
|
+
end
|
6
|
+
SimpleCov.minimum_coverage 100
|
4
7
|
end
|
8
|
+
|
5
9
|
require 'rspec'
|
6
10
|
require 'support/models'
|
7
11
|
require 'act_as_page_extractor'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: act_as_page_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- PhlowerTeam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,21 +16,21 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1
|
19
|
+
version: '1'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1
|
26
|
+
version: '1'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '12
|
33
|
+
version: '12'
|
34
34
|
- - ">="
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: 12.3.3
|
@@ -40,7 +40,7 @@ dependencies:
|
|
40
40
|
requirements:
|
41
41
|
- - "~>"
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '12
|
43
|
+
version: '12'
|
44
44
|
- - ">="
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: 12.3.3
|
@@ -92,14 +92,14 @@ dependencies:
|
|
92
92
|
requirements:
|
93
93
|
- - "~>"
|
94
94
|
- !ruby/object:Gem::Version
|
95
|
-
version: '6
|
95
|
+
version: '6'
|
96
96
|
type: :runtime
|
97
97
|
prerelease: false
|
98
98
|
version_requirements: !ruby/object:Gem::Requirement
|
99
99
|
requirements:
|
100
100
|
- - "~>"
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
version: '6
|
102
|
+
version: '6'
|
103
103
|
- !ruby/object:Gem::Dependency
|
104
104
|
name: awesome_print
|
105
105
|
requirement: !ruby/object:Gem::Requirement
|
@@ -148,34 +148,34 @@ dependencies:
|
|
148
148
|
requirements:
|
149
149
|
- - "~>"
|
150
150
|
- !ruby/object:Gem::Version
|
151
|
-
version:
|
151
|
+
version: '1'
|
152
152
|
type: :runtime
|
153
153
|
prerelease: false
|
154
154
|
version_requirements: !ruby/object:Gem::Requirement
|
155
155
|
requirements:
|
156
156
|
- - "~>"
|
157
157
|
- !ruby/object:Gem::Version
|
158
|
-
version:
|
158
|
+
version: '1'
|
159
159
|
- !ruby/object:Gem::Dependency
|
160
160
|
name: pdf-reader
|
161
161
|
requirement: !ruby/object:Gem::Requirement
|
162
162
|
requirements:
|
163
|
-
- - ">="
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: 1.4.0
|
166
163
|
- - "~>"
|
167
164
|
- !ruby/object:Gem::Version
|
168
|
-
version: 1
|
165
|
+
version: '1'
|
166
|
+
- - ">="
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: '1.4'
|
169
169
|
type: :runtime
|
170
170
|
prerelease: false
|
171
171
|
version_requirements: !ruby/object:Gem::Requirement
|
172
172
|
requirements:
|
173
|
-
- - ">="
|
174
|
-
- !ruby/object:Gem::Version
|
175
|
-
version: 1.4.0
|
176
173
|
- - "~>"
|
177
174
|
- !ruby/object:Gem::Version
|
178
|
-
version: 1
|
175
|
+
version: '1'
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '1.4'
|
179
179
|
- !ruby/object:Gem::Dependency
|
180
180
|
name: total_compressor
|
181
181
|
requirement: !ruby/object:Gem::Requirement
|
@@ -221,6 +221,7 @@ files:
|
|
221
221
|
- README.md
|
222
222
|
- Rakefile
|
223
223
|
- act_as_page_extractor.gemspec
|
224
|
+
- docs/publishing.md
|
224
225
|
- lib/act_as_page_extractor.rb
|
225
226
|
- lib/act_as_page_extractor/modules/extracting.rb
|
226
227
|
- lib/act_as_page_extractor/modules/interface.rb
|
@@ -248,7 +249,7 @@ files:
|
|
248
249
|
- test/test-doc-3-pages.rtf
|
249
250
|
- test/test-doc-3-pages.txt
|
250
251
|
- test/test-doc-3-pages.wrong
|
251
|
-
homepage: https://github.com/phlowerteam
|
252
|
+
homepage: https://github.com/phlowerteam/act_as_page_extractor
|
252
253
|
licenses:
|
253
254
|
- MIT
|
254
255
|
metadata: {}
|
@@ -267,7 +268,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
267
268
|
- !ruby/object:Gem::Version
|
268
269
|
version: '0'
|
269
270
|
requirements: []
|
270
|
-
rubygems_version: 3.
|
271
|
+
rubygems_version: 3.3.22
|
271
272
|
signing_key:
|
272
273
|
specification_version: 4
|
273
274
|
summary: Uses system calls
|