slaw 0.14.2 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/slaw/extract/extractor.rb +19 -3
- data/lib/slaw/extract/html_to_akn_text.xsl +115 -0
- data/lib/slaw/extract/yomu_patch.rb +1 -1
- data/lib/slaw/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 96b892b9bbe4ff26ec5609579851b092bb32aa9b
|
4
|
+
data.tar.gz: fe30f6724d2c2bb282d3df0f173a749451f3e291
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 977c23bc3650313e40d7fa7358dd848d51011da6329f3b4d905f388b6c79bfbbcf488b27ddd68b5f821365e3a1b1cfebc39f78d52c05257579f484ab2985714c
|
7
|
+
data.tar.gz: '0528d773d487b6cde6e5b67c3a4aa69f89c389e2ec62136c0495ec7df9f5278fba5023f9aac33b7bb542a5f795989b163f3d109fd558e484fed2d5848fd7437e'
|
data/README.md
CHANGED
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
|
|
218
218
|
|
219
219
|
## Changelog
|
220
220
|
|
221
|
+
### 0.15.0
|
222
|
+
|
223
|
+
* Support tables in many non-PDF documents (e.g. Word documents) by converting to HTML and then to Akoma Ntoso
|
224
|
+
|
221
225
|
### 0.14.2
|
222
226
|
|
223
227
|
* Convert non-breaking space (\xA0) to space
|
@@ -28,6 +28,8 @@ module Slaw
|
|
28
28
|
case mimetype && mimetype.type
|
29
29
|
when 'application/pdf'
|
30
30
|
extract_from_pdf(filename)
|
31
|
+
when 'text/html', nil
|
32
|
+
extract_from_html(filename)
|
31
33
|
when 'text/plain', nil
|
32
34
|
extract_from_text(filename)
|
33
35
|
else
|
@@ -78,6 +80,10 @@ module Slaw
|
|
78
80
|
File.read(filename)
|
79
81
|
end
|
80
82
|
|
83
|
+
def extract_from_html(filename)
|
84
|
+
html_to_text(File.read(filename))
|
85
|
+
end
|
86
|
+
|
81
87
|
# Extract text from +filename+ by sending it to apache tika
|
82
88
|
# http://tika.apache.org/
|
83
89
|
def extract_via_tika(filename)
|
@@ -87,9 +93,19 @@ module Slaw
|
|
87
93
|
require 'slaw/extract/yomu_patch'
|
88
94
|
logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
|
89
95
|
|
90
|
-
|
91
|
-
logger.info("Tika returned #{text.length} bytes")
|
92
|
-
text
|
96
|
+
html = Yomu.text_from_file(filename)
|
97
|
+
logger.info("Tika returned #{html.length} bytes")
|
98
|
+
# transform html into text
|
99
|
+
html_to_text(html)
|
100
|
+
end
|
101
|
+
|
102
|
+
def html_to_text(html)
|
103
|
+
here = File.dirname(__FILE__)
|
104
|
+
xslt = Nokogiri::XSLT(File.open(File.join([here, 'html_to_akn_text.xsl'])))
|
105
|
+
|
106
|
+
text = xslt.transform(Nokogiri::HTML(html)).to_s
|
107
|
+
# remove XML encoding at top
|
108
|
+
text.sub(/^<\?xml [^>]*>/, '')
|
93
109
|
end
|
94
110
|
|
95
111
|
def remove_pdf_password(filename)
|
@@ -0,0 +1,115 @@
|
|
1
|
+
<?xml version="1.0"?>
|
2
|
+
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
|
3
|
+
xmlns="http://www.w3.org/1999/xhtml">
|
4
|
+
|
5
|
+
<xsl:output method="text" indent="no" omit-xml-declaration="yes" encoding="utf-8" />
|
6
|
+
<xsl:strip-space elements="*"/>
|
7
|
+
|
8
|
+
<xsl:template match="html">
|
9
|
+
<xsl:apply-templates/>
|
10
|
+
</xsl:template>
|
11
|
+
|
12
|
+
<xsl:template match="head|style|script|link" />
|
13
|
+
|
14
|
+
<xsl:template match="p|div">
|
15
|
+
<xsl:choose>
|
16
|
+
<xsl:when test="starts-with(., '[') and substring(., string-length(.)) = ']'">
|
17
|
+
<!-- block elems that are wrapped in [ and ] are probably remarks -->
|
18
|
+
<xsl:text>[</xsl:text><xsl:apply-templates /><xsl:text>]</xsl:text>
|
19
|
+
</xsl:when>
|
20
|
+
<xsl:otherwise>
|
21
|
+
<xsl:apply-templates />
|
22
|
+
</xsl:otherwise>
|
23
|
+
</xsl:choose>
|
24
|
+
<!-- p and div tags must end with a newline -->
|
25
|
+
<xsl:text>
|
26
|
+
</xsl:text>
|
27
|
+
</xsl:template>
|
28
|
+
|
29
|
+
<xsl:template match="table">
|
30
|
+
<xsl:text>{| </xsl:text>
|
31
|
+
|
32
|
+
<!-- attributes -->
|
33
|
+
<xsl:for-each select="@*[local-name()!='id']">
|
34
|
+
<xsl:value-of select="local-name(.)" />
|
35
|
+
<xsl:text>="</xsl:text>
|
36
|
+
<xsl:value-of select="." />
|
37
|
+
<xsl:text>" </xsl:text>
|
38
|
+
</xsl:for-each>
|
39
|
+
<xsl:text>
|
40
|
+
|-</xsl:text>
|
41
|
+
|
42
|
+
<xsl:apply-templates />
|
43
|
+
<xsl:text>
|
44
|
+
|}
|
45
|
+
|
46
|
+
</xsl:text>
|
47
|
+
</xsl:template>
|
48
|
+
|
49
|
+
<xsl:template match="tr">
|
50
|
+
<xsl:apply-templates />
|
51
|
+
<xsl:text>
|
52
|
+
|-</xsl:text>
|
53
|
+
</xsl:template>
|
54
|
+
|
55
|
+
<xsl:template match="th|td">
|
56
|
+
<xsl:choose>
|
57
|
+
<xsl:when test="local-name(.) = 'th'">
|
58
|
+
<xsl:text>
|
59
|
+
! </xsl:text>
|
60
|
+
</xsl:when>
|
61
|
+
<xsl:when test="local-name(.) = 'td'">
|
62
|
+
<xsl:text>
|
63
|
+
| </xsl:text>
|
64
|
+
</xsl:when>
|
65
|
+
</xsl:choose>
|
66
|
+
|
67
|
+
<!-- attributes -->
|
68
|
+
<xsl:if test="@*">
|
69
|
+
<xsl:for-each select="@*">
|
70
|
+
<xsl:value-of select="local-name(.)" />
|
71
|
+
<xsl:text>="</xsl:text>
|
72
|
+
<xsl:value-of select="." />
|
73
|
+
<xsl:text>" </xsl:text>
|
74
|
+
</xsl:for-each>
|
75
|
+
<xsl:text>| </xsl:text>
|
76
|
+
</xsl:if>
|
77
|
+
|
78
|
+
<xsl:apply-templates />
|
79
|
+
</xsl:template>
|
80
|
+
|
81
|
+
<!-- don't end p tags with newlines in tables -->
|
82
|
+
<xsl:template match="table//p">
|
83
|
+
<xsl:apply-templates />
|
84
|
+
</xsl:template>
|
85
|
+
|
86
|
+
<!-- END tables -->
|
87
|
+
|
88
|
+
<xsl:template match="a[href]">
|
89
|
+
<xsl:text>[</xsl:text>
|
90
|
+
<xsl:apply-templates />
|
91
|
+
<xsl:text>](</xsl:text>
|
92
|
+
<xsl:value-of select="@href" />
|
93
|
+
<xsl:text>)</xsl:text>
|
94
|
+
</xsl:template>
|
95
|
+
|
96
|
+
<xsl:template match="img">
|
97
|
+
<xsl:text></xsl:text>
|
102
|
+
</xsl:template>
|
103
|
+
|
104
|
+
<xsl:template match="br">
|
105
|
+
<xsl:text>
|
106
|
+
</xsl:text>
|
107
|
+
</xsl:template>
|
108
|
+
|
109
|
+
|
110
|
+
<!-- for most nodes, just dump their text content -->
|
111
|
+
<xsl:template match="*">
|
112
|
+
<xsl:text/><xsl:apply-templates /><xsl:text/>
|
113
|
+
</xsl:template>
|
114
|
+
|
115
|
+
</xsl:stylesheet>
|
@@ -2,7 +2,7 @@ require 'yomu'
|
|
2
2
|
|
3
3
|
class Yomu
|
4
4
|
def self.text_from_file(filename)
|
5
|
-
IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH}
|
5
|
+
IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --html '#{filename}'", 'r') do |io|
|
6
6
|
io.read
|
7
7
|
end
|
8
8
|
end
|
data/lib/slaw/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.2
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-12-
|
11
|
+
date: 2017-12-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -185,6 +185,7 @@ files:
|
|
185
185
|
- lib/slaw/bylaw.rb
|
186
186
|
- lib/slaw/collection.rb
|
187
187
|
- lib/slaw/extract/extractor.rb
|
188
|
+
- lib/slaw/extract/html_to_akn_text.xsl
|
188
189
|
- lib/slaw/extract/yomu_patch.rb
|
189
190
|
- lib/slaw/generator.rb
|
190
191
|
- lib/slaw/lifecycle_event.rb
|