act_as_page_extractor 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -1
- data/Gemfile.lock +12 -15
- data/README.md +80 -71
- data/act_as_page_extractor.gemspec +6 -6
- data/docs/publishing.md +14 -0
- data/lib/act_as_page_extractor/modules/tools.rb +4 -4
- data/lib/act_as_page_extractor/version.rb +1 -1
- data/spec/spec_helper.rb +6 -2
- metadata +21 -20
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d0bd64f8e12d0c7bb3a75893738e30af616e4bcc5b958b18853b35363823b5ef
|
4
|
+
data.tar.gz: d87505b025bd924a545e2f6cbd9071958d65f993a234092f65b3cb7e108b16b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af0708407f3b4546424e1666926c248cdb9fe0813ede2dd642d099836282d2f608d8d47edcd5cd513cef9b3ead231c192f6b815ec7721eb141b6820f561d0f30
|
7
|
+
data.tar.gz: 6a5969118ff6a6141aaaec8989e38670f75a817afa230636ed84f9f2a4e7c1f160ee569664f940e1355cfc74dff56e30370ca993fbb38d2a7139c17f56858acf
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
act_as_page_extractor (0.
|
5
|
-
activerecord (~> 6
|
4
|
+
act_as_page_extractor (0.6.1)
|
5
|
+
activerecord (~> 6)
|
6
6
|
awesome_print (~> 1)
|
7
7
|
docsplit (~> 0)
|
8
8
|
filesize (~> 0)
|
9
|
-
pdf-reader (~> 1
|
9
|
+
pdf-reader (~> 1, >= 1.4)
|
10
10
|
pdf_utils (~> 0)
|
11
|
-
prawn (~>
|
11
|
+
prawn (~> 1)
|
12
12
|
total_compressor (~> 0)
|
13
13
|
|
14
14
|
GEM
|
@@ -38,6 +38,7 @@ GEM
|
|
38
38
|
i18n (1.14.5)
|
39
39
|
concurrent-ruby (~> 1.0)
|
40
40
|
minitest (5.25.1)
|
41
|
+
pdf-core (0.4.0)
|
41
42
|
pdf-reader (1.4.1)
|
42
43
|
Ascii85 (~> 1.0.0)
|
43
44
|
afm (~> 0.2.1)
|
@@ -45,13 +46,9 @@ GEM
|
|
45
46
|
ruby-rc4
|
46
47
|
ttfunk
|
47
48
|
pdf_utils (0.1.0)
|
48
|
-
prawn (
|
49
|
-
|
50
|
-
|
51
|
-
prawn-security (>= 0.7.1, < 0.8)
|
52
|
-
prawn-core (0.7.2)
|
53
|
-
prawn-layout (0.7.2)
|
54
|
-
prawn-security (0.7.1)
|
49
|
+
prawn (1.3.0)
|
50
|
+
pdf-core (~> 0.4.0)
|
51
|
+
ttfunk (~> 1.4.0)
|
55
52
|
rake (12.3.3)
|
56
53
|
rspec (3.13.0)
|
57
54
|
rspec-core (~> 3.13.0)
|
@@ -77,7 +74,7 @@ GEM
|
|
77
74
|
total_compressor (0.1.11)
|
78
75
|
awesome_print (~> 1.1, >= 1.1.0)
|
79
76
|
rubyzip (~> 1.2, >= 1.2.2)
|
80
|
-
ttfunk (1.
|
77
|
+
ttfunk (1.4.0)
|
81
78
|
tzinfo (2.0.6)
|
82
79
|
concurrent-ruby (~> 1.0)
|
83
80
|
zeitwerk (2.6.17)
|
@@ -87,16 +84,16 @@ PLATFORMS
|
|
87
84
|
|
88
85
|
DEPENDENCIES
|
89
86
|
act_as_page_extractor!
|
90
|
-
activerecord (~> 6
|
87
|
+
activerecord (~> 6)
|
91
88
|
awesome_print
|
92
|
-
bundler (~> 1
|
89
|
+
bundler (~> 1)
|
93
90
|
byebug
|
94
91
|
docsplit
|
95
92
|
filesize
|
96
93
|
pdf-reader
|
97
94
|
pdf_utils
|
98
95
|
prawn
|
99
|
-
rake (~> 12
|
96
|
+
rake (~> 12, >= 12.3.3)
|
100
97
|
rspec
|
101
98
|
simplecov
|
102
99
|
total_compressor
|
data/README.md
CHANGED
@@ -7,102 +7,111 @@ Library for extracting plain text from documents(files) for further processing (
|
|
7
7
|
|
8
8
|
Install appropriate tools before using:
|
9
9
|
|
10
|
-
|
11
|
-
|
10
|
+
```sh
|
11
|
+
sudo apt-get install zlib1g zlib1g-dev zip rar p7zip-full
|
12
|
+
```
|
12
13
|
Add this line to your application's Gemfile:
|
13
14
|
|
14
|
-
|
15
|
-
|
15
|
+
```rb
|
16
|
+
gem 'act_as_page_extractor'
|
17
|
+
bundle
|
18
|
+
```
|
16
19
|
## Usage
|
17
20
|
|
18
|
-
For example, for model Document we need
|
21
|
+
For example, for the Document model in the Rails framework we need to run:
|
19
22
|
|
20
|
-
|
21
|
-
|
23
|
+
```sh
|
24
|
+
rails g act_as_page_extractor:migration Document category_id user_id
|
25
|
+
```
|
22
26
|
|
23
27
|
As a result we get two migration files:
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
end
|
45
|
-
|
46
|
-
add_index :extracted_pages, :document_id
|
47
|
-
add_index :extracted_pages, :category_id
|
48
|
-
add_index :extracted_pages, [:document_id, :category_id]
|
49
|
-
add_index :extracted_pages, [:document_id, :page_number]
|
50
|
-
end
|
28
|
+
```rb
|
29
|
+
class AddPageExtractorFields < ActiveRecord::Migration
|
30
|
+
def change
|
31
|
+
add_column :documents, :page_extraction_state, :string, default: ''
|
32
|
+
add_column :documents, :page_extraction_pages, :integer, default: 0
|
33
|
+
add_column :documents, :page_extraction_doctype, :string, default: ''
|
34
|
+
add_column :documents, :page_extraction_filesize, :string, default: ''
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
class CreateExtractedPages < ActiveRecord::Migration
|
39
|
+
def change
|
40
|
+
create_table :extracted_pages do |t|
|
41
|
+
t.text :page
|
42
|
+
t.integer :document_id
|
43
|
+
t.integer :category_id
|
44
|
+
t.integer :user_id
|
45
|
+
t.integer :page_number
|
46
|
+
|
47
|
+
t.timestamps null: false
|
51
48
|
end
|
52
49
|
|
50
|
+
add_index :extracted_pages, :document_id
|
51
|
+
add_index :extracted_pages, :category_id
|
52
|
+
add_index :extracted_pages, [:document_id, :category_id]
|
53
|
+
add_index :extracted_pages, [:document_id, :page_number]
|
54
|
+
end
|
55
|
+
end
|
56
|
+
```
|
53
57
|
|
54
|
-
Model Document must have field which contains path to file(supports [different archive types](https://github.com/phlowerteam/total_compressor) that contains [txt, pdf, doc/x, txt, html, rtf, ...](https://
|
58
|
+
The Document model must have a field which contains the path to a file (supports [different archive types](https://github.com/phlowerteam/total_compressor) that contain [txt, pdf, doc/x, txt, html, rtf, ...](https://docs-old.exoplatform.org/public/index.jsp?topic=%2FPLF41%2FPLFAdminGuide.Configuration.JODConverter.html))
|
55
59
|
|
56
60
|
Add the following initialization parameters to the model:
|
57
61
|
|
58
|
-
|
59
|
-
|
62
|
+
```rb
|
63
|
+
class Document < ActiveRecord::Base
|
64
|
+
include ActAsPageExtractor
|
60
65
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
66
|
+
act_as_page_extractor options: {
|
67
|
+
document_class: 'Document',
|
68
|
+
save_as_pdf: true,
|
69
|
+
filename: :filename,
|
70
|
+
document_id: :document_id,
|
71
|
+
additional_fields: [:category_id, :user_id],
|
72
|
+
#file_storage: "/full/path/to/documents/storage",
|
73
|
+
#pdf_storage: "/full/path/to/extracted/pdf/storage"
|
74
|
+
}
|
70
75
|
|
71
|
-
|
72
|
-
|
76
|
+
has_many :extracted_pages, dependent: :destroy
|
77
|
+
end
|
78
|
+
```
|
73
79
|
|
74
80
|
Now our instance has a few new methods:
|
75
81
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
82
|
+
```rb
|
83
|
+
document = Document.first
|
84
|
+
document.page_extract!
|
85
|
+
document.extracted_pages
|
86
|
+
document.pdf_path # if option 'save_as_pdf' is 'true'
|
80
87
|
|
81
|
-
|
82
|
-
|
88
|
+
# Access to pages
|
89
|
+
ExtractedPage.count
|
83
90
|
|
84
|
-
|
85
|
-
|
91
|
+
# Importing whole directory of documents
|
92
|
+
ActAsPageExtractor.import_files('/path/to/folder/with/documents')
|
86
93
|
|
87
|
-
|
88
|
-
|
94
|
+
# We can use cron to run the processing of all the new documents
|
95
|
+
ActAsPageExtractor.start_extraction
|
89
96
|
|
90
|
-
|
91
|
-
|
97
|
+
# Getting statistics information of all documents
|
98
|
+
ActAsPageExtractor.statistics
|
99
|
+
```
|
92
100
|
|
93
|
-
Parameters of initializing
|
101
|
+
Parameters of initializing **act_as_page_extractor**:
|
94
102
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
103
|
+
* **document_class** - name of model (e.g. Document)
|
104
|
+
* **save_as_pdf** - boolean [true, false], set to true when we want to save the temporary pdf
|
105
|
+
* **filename** - name of field which contains access to the file and it should be an object with 'url' method that returns path to file (e.g. CarrierWave object with 'filename.url')
|
106
|
+
* **document_id** - name for saving id
|
107
|
+
* **additional_fields** - additional fields that are added to each extracted page (e.g. for indexing, etc.)
|
108
|
+
* **file_storage** - path for saving tmp files (by default it is "public")
|
109
|
+
* **pdf_storage** - path for saving pdf (by default it is "public/uploads/extracted/pdf")
|
102
110
|
|
103
111
|
## Run tests
|
104
|
-
|
105
|
-
|
112
|
+
```sh
|
113
|
+
rspec
|
114
|
+
```
|
106
115
|
## Contributing
|
107
116
|
1. Fork it
|
108
117
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
@@ -115,5 +124,5 @@ https://github.com/phlowerteam
|
|
115
124
|
phlowerteam@gmail.com
|
116
125
|
|
117
126
|
## License
|
118
|
-
Copyright (c)
|
127
|
+
Copyright (c) 2024 PhlowerTeam
|
119
128
|
MIT License
|
@@ -10,25 +10,25 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['phlowerteam@gmail.com']
|
11
11
|
spec.description = %q{Library (Docsplit wrapper) for text extraction from pdf, doc/x, txt files with OpenOffice}
|
12
12
|
spec.summary = %q{Uses system calls}
|
13
|
-
spec.homepage = 'https://github.com/phlowerteam'
|
13
|
+
spec.homepage = 'https://github.com/phlowerteam/act_as_page_extractor'
|
14
14
|
spec.license = 'MIT'
|
15
15
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
18
18
|
spec.require_paths = ['lib']
|
19
19
|
|
20
|
-
spec.add_development_dependency 'bundler', '~> 1
|
21
|
-
spec.add_development_dependency 'rake', '~> 12
|
20
|
+
spec.add_development_dependency 'bundler', '~> 1'
|
21
|
+
spec.add_development_dependency 'rake', '~> 12', '>= 12.3.3'
|
22
22
|
spec.add_development_dependency 'byebug', '~> 0'
|
23
23
|
spec.add_development_dependency 'rspec', '~> 0'
|
24
24
|
spec.add_development_dependency 'simplecov', '~> 0'
|
25
25
|
|
26
|
-
spec.add_runtime_dependency 'activerecord', '~> 6
|
26
|
+
spec.add_runtime_dependency 'activerecord', '~> 6'
|
27
27
|
spec.add_runtime_dependency 'awesome_print', '~> 1'
|
28
28
|
spec.add_runtime_dependency 'docsplit', '~> 0' # API for OpenOffice jodconverter (any to pdf)
|
29
29
|
spec.add_runtime_dependency 'pdf_utils', '~> 0' # getting text from pdf
|
30
|
-
spec.add_runtime_dependency 'prawn', '~>
|
31
|
-
spec.add_runtime_dependency 'pdf-reader', '~> 1
|
30
|
+
spec.add_runtime_dependency 'prawn', '~> 1' # need for pdf_utils
|
31
|
+
spec.add_runtime_dependency 'pdf-reader', '~> 1', '>= 1.4' # need for pdf_utils
|
32
32
|
spec.add_runtime_dependency 'total_compressor', '~> 0' # decompressing files
|
33
33
|
spec.add_runtime_dependency 'filesize', '~> 0' # pretty size of file
|
34
34
|
end
|
data/docs/publishing.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# GEM publishing
|
2
|
+
|
3
|
+
```sh
|
4
|
+
# Add features or fix bugs
|
5
|
+
# Increase version number x.y.z
|
6
|
+
# lib/act_as_page_extractor/version.rb
|
7
|
+
bundle update
|
8
|
+
rspec
|
9
|
+
# git commit & git push
|
10
|
+
|
11
|
+
gem build act_as_page_extractor.gemspec
|
12
|
+
gem install ./act_as_page_extractor-x.y.z.gem
|
13
|
+
gem push act_as_page_extractor-x.y.z.gem
|
14
|
+
```
|
@@ -1,17 +1,17 @@
|
|
1
1
|
require 'timeout'
|
2
2
|
|
3
3
|
module ActAsPageExtractor
|
4
|
+
# :nocov:
|
4
5
|
def timeout_wrapper
|
5
6
|
result = nil
|
6
7
|
begin
|
7
8
|
result = Timeout::timeout(60*5) { yield }
|
8
9
|
rescue
|
9
|
-
# :nocov:
|
10
10
|
ensure
|
11
|
-
# :nocov:
|
12
11
|
result
|
13
12
|
end
|
14
13
|
end
|
14
|
+
# :nocov:
|
15
15
|
|
16
16
|
def is_extracted
|
17
17
|
@pdf_pages.to_i > 0 && self.extracted_pages.count == @pdf_pages
|
@@ -46,11 +46,11 @@ module ActAsPageExtractor
|
|
46
46
|
# ap "@copy_document_path"
|
47
47
|
# ap @copy_document_path
|
48
48
|
# ap "@document_path"
|
49
|
-
|
49
|
+
# ap @document_path
|
50
50
|
# ap "@pdf_path"
|
51
51
|
# ap @pdf_path
|
52
52
|
# ap "@pdf_pages"
|
53
|
-
|
53
|
+
# ap @pdf_pages
|
54
54
|
end
|
55
55
|
# :nocov:
|
56
56
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
|
-
|
1
|
+
unless ENV['SKIP_COVERAGE']
|
2
2
|
require 'simplecov'
|
3
|
-
SimpleCov.start 'rails'
|
3
|
+
SimpleCov.start 'rails' do
|
4
|
+
add_filter 'vendor'
|
5
|
+
end
|
6
|
+
SimpleCov.minimum_coverage 100
|
4
7
|
end
|
8
|
+
|
5
9
|
require 'rspec'
|
6
10
|
require 'support/models'
|
7
11
|
require 'act_as_page_extractor'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: act_as_page_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- PhlowerTeam
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,21 +16,21 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1
|
19
|
+
version: '1'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1
|
26
|
+
version: '1'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '12
|
33
|
+
version: '12'
|
34
34
|
- - ">="
|
35
35
|
- !ruby/object:Gem::Version
|
36
36
|
version: 12.3.3
|
@@ -40,7 +40,7 @@ dependencies:
|
|
40
40
|
requirements:
|
41
41
|
- - "~>"
|
42
42
|
- !ruby/object:Gem::Version
|
43
|
-
version: '12
|
43
|
+
version: '12'
|
44
44
|
- - ">="
|
45
45
|
- !ruby/object:Gem::Version
|
46
46
|
version: 12.3.3
|
@@ -92,14 +92,14 @@ dependencies:
|
|
92
92
|
requirements:
|
93
93
|
- - "~>"
|
94
94
|
- !ruby/object:Gem::Version
|
95
|
-
version: '6
|
95
|
+
version: '6'
|
96
96
|
type: :runtime
|
97
97
|
prerelease: false
|
98
98
|
version_requirements: !ruby/object:Gem::Requirement
|
99
99
|
requirements:
|
100
100
|
- - "~>"
|
101
101
|
- !ruby/object:Gem::Version
|
102
|
-
version: '6
|
102
|
+
version: '6'
|
103
103
|
- !ruby/object:Gem::Dependency
|
104
104
|
name: awesome_print
|
105
105
|
requirement: !ruby/object:Gem::Requirement
|
@@ -148,34 +148,34 @@ dependencies:
|
|
148
148
|
requirements:
|
149
149
|
- - "~>"
|
150
150
|
- !ruby/object:Gem::Version
|
151
|
-
version:
|
151
|
+
version: '1'
|
152
152
|
type: :runtime
|
153
153
|
prerelease: false
|
154
154
|
version_requirements: !ruby/object:Gem::Requirement
|
155
155
|
requirements:
|
156
156
|
- - "~>"
|
157
157
|
- !ruby/object:Gem::Version
|
158
|
-
version:
|
158
|
+
version: '1'
|
159
159
|
- !ruby/object:Gem::Dependency
|
160
160
|
name: pdf-reader
|
161
161
|
requirement: !ruby/object:Gem::Requirement
|
162
162
|
requirements:
|
163
|
-
- - ">="
|
164
|
-
- !ruby/object:Gem::Version
|
165
|
-
version: 1.4.0
|
166
163
|
- - "~>"
|
167
164
|
- !ruby/object:Gem::Version
|
168
|
-
version: 1
|
165
|
+
version: '1'
|
166
|
+
- - ">="
|
167
|
+
- !ruby/object:Gem::Version
|
168
|
+
version: '1.4'
|
169
169
|
type: :runtime
|
170
170
|
prerelease: false
|
171
171
|
version_requirements: !ruby/object:Gem::Requirement
|
172
172
|
requirements:
|
173
|
-
- - ">="
|
174
|
-
- !ruby/object:Gem::Version
|
175
|
-
version: 1.4.0
|
176
173
|
- - "~>"
|
177
174
|
- !ruby/object:Gem::Version
|
178
|
-
version: 1
|
175
|
+
version: '1'
|
176
|
+
- - ">="
|
177
|
+
- !ruby/object:Gem::Version
|
178
|
+
version: '1.4'
|
179
179
|
- !ruby/object:Gem::Dependency
|
180
180
|
name: total_compressor
|
181
181
|
requirement: !ruby/object:Gem::Requirement
|
@@ -221,6 +221,7 @@ files:
|
|
221
221
|
- README.md
|
222
222
|
- Rakefile
|
223
223
|
- act_as_page_extractor.gemspec
|
224
|
+
- docs/publishing.md
|
224
225
|
- lib/act_as_page_extractor.rb
|
225
226
|
- lib/act_as_page_extractor/modules/extracting.rb
|
226
227
|
- lib/act_as_page_extractor/modules/interface.rb
|
@@ -248,7 +249,7 @@ files:
|
|
248
249
|
- test/test-doc-3-pages.rtf
|
249
250
|
- test/test-doc-3-pages.txt
|
250
251
|
- test/test-doc-3-pages.wrong
|
251
|
-
homepage: https://github.com/phlowerteam
|
252
|
+
homepage: https://github.com/phlowerteam/act_as_page_extractor
|
252
253
|
licenses:
|
253
254
|
- MIT
|
254
255
|
metadata: {}
|
@@ -267,7 +268,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
267
268
|
- !ruby/object:Gem::Version
|
268
269
|
version: '0'
|
269
270
|
requirements: []
|
270
|
-
rubygems_version: 3.
|
271
|
+
rubygems_version: 3.3.22
|
271
272
|
signing_key:
|
272
273
|
specification_version: 4
|
273
274
|
summary: Uses system calls
|