slaw 0.14.2 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 00d35b04f6d4d01a206bf5425face038e0b86f56
4
- data.tar.gz: 514c2b482ffc969b32bdcd7e0d48cce44f63f979
3
+ metadata.gz: 96b892b9bbe4ff26ec5609579851b092bb32aa9b
4
+ data.tar.gz: fe30f6724d2c2bb282d3df0f173a749451f3e291
5
5
  SHA512:
6
- metadata.gz: f4e0f3eda7310ad7bc60ff57e19e7151f55d4736fb9beffabe560311c97a611fe870833593f96c7ac7829c1deec2e31f91e2d1c54729f8f1c8ced9fdcb369408
7
- data.tar.gz: d48951afdc9be70b291451356a1e560a4d10bf8ce03e06387ce6c87c8a635878bb3b57e8fd1a3f0df802005af7ad0366e1286c8623866ca5fef9f85f854b3ae1
6
+ metadata.gz: 977c23bc3650313e40d7fa7358dd848d51011da6329f3b4d905f388b6c79bfbbcf488b27ddd68b5f821365e3a1b1cfebc39f78d52c05257579f484ab2985714c
7
+ data.tar.gz: '0528d773d487b6cde6e5b67c3a4aa69f89c389e2ec62136c0495ec7df9f5278fba5023f9aac33b7bb542a5f795989b163f3d109fd558e484fed2d5848fd7437e'
data/README.md CHANGED
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
218
218
 
219
219
  ## Changelog
220
220
 
221
+ ### 0.15.0
222
+
223
+ * Support tables in many non-PDF documents (e.g. Word documents) by converting to HTML and then to Akoma Ntoso
224
+
221
225
  ### 0.14.2
222
226
 
223
227
  * Convert non-breaking space (\xA0) to space
@@ -28,6 +28,8 @@ module Slaw
28
28
  case mimetype && mimetype.type
29
29
  when 'application/pdf'
30
30
  extract_from_pdf(filename)
31
+ when 'text/html', nil
32
+ extract_from_html(filename)
31
33
  when 'text/plain', nil
32
34
  extract_from_text(filename)
33
35
  else
@@ -78,6 +80,10 @@ module Slaw
78
80
  File.read(filename)
79
81
  end
80
82
 
83
# Extract text from the HTML file at +filename+ by reading it in full and
# passing the markup to html_to_text.
def extract_from_html(filename)
  markup = File.read(filename)
  html_to_text(markup)
end
86
+
81
87
  # Extract text from +filename+ by sending it to apache tika
82
88
  # http://tika.apache.org/
83
89
  def extract_via_tika(filename)
@@ -87,9 +93,19 @@ module Slaw
87
93
  require 'slaw/extract/yomu_patch'
88
94
  logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
89
95
 
90
- text = Yomu.text_from_file(filename)
91
- logger.info("Tika returned #{text.length} bytes")
92
- text
96
+ html = Yomu.text_from_file(filename)
97
+ logger.info("Tika returned #{html.length} bytes")
98
+ # transform html into text
99
+ html_to_text(html)
100
+ end
101
+
102
# Convert an HTML string to text by running it through the
# html_to_akn_text.xsl stylesheet that lives in the same directory as this file.
#
# @param html [String] HTML markup to transform
# @return [String] the transformed text with any XML declaration at the very
#   top removed
def html_to_text(html)
  # File.read closes the stylesheet as soon as its contents are loaded; passing
  # an open File to Nokogiri would leave the descriptor open until GC.
  stylesheet_path = File.join(File.dirname(__FILE__), 'html_to_akn_text.xsl')
  xslt = Nokogiri::XSLT(File.read(stylesheet_path))

  text = xslt.transform(Nokogiri::HTML(html)).to_s
  # Remove the XML declaration at the top. \A anchors to the start of the
  # string, whereas ^ would also match at the start of any later line of text.
  text.sub(/\A<\?xml [^>]*>/, '')
end
94
110
 
95
111
  def remove_pdf_password(filename)
@@ -0,0 +1,115 @@
1
<?xml version="1.0"?>
<!--
  Transforms an HTML document into plain text (output method "text").

  * p and div elements emit their content followed by a newline; when their
    text starts with [ and ends with ] they are wrapped in an extra [ and ].
  * tables are written as {| ... |} blocks with |- row separators, ! header
    cells and | data cells; attributes are emitted as name="value" pairs.
  * links become [text](href) and images become ![alt](src).
  * every other element contributes only the output of its children.

  NOTE(review): newlines inside xsl:text are written as &#10; so that
  re-indenting this file cannot change the output. This assumes the literal
  text nodes carried no indentation of their own; confirm against the shipped
  stylesheet.
-->
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
  xmlns="http://www.w3.org/1999/xhtml">

  <xsl:output method="text" indent="no" omit-xml-declaration="yes" encoding="utf-8" />
  <xsl:strip-space elements="*"/>

  <xsl:template match="html">
    <xsl:apply-templates/>
  </xsl:template>

  <!-- these elements produce no output at all -->
  <xsl:template match="head|style|script|link" />

  <xsl:template match="p|div">
    <xsl:choose>
      <xsl:when test="starts-with(., '[') and substring(., string-length(.)) = ']'">
        <!-- block elems that are wrapped in [ and ] are probably remarks -->
        <xsl:text>[</xsl:text><xsl:apply-templates /><xsl:text>]</xsl:text>
      </xsl:when>
      <xsl:otherwise>
        <xsl:apply-templates />
      </xsl:otherwise>
    </xsl:choose>
    <!-- p and div tags must end with a newline -->
    <xsl:text>&#10;</xsl:text>
  </xsl:template>

  <!-- START tables -->

  <xsl:template match="table">
    <xsl:text>{| </xsl:text>

    <!-- attributes: every attribute except id, as name="value" pairs -->
    <xsl:for-each select="@*[local-name()!='id']">
      <xsl:value-of select="local-name(.)" />
      <xsl:text>="</xsl:text>
      <xsl:value-of select="." />
      <xsl:text>" </xsl:text>
    </xsl:for-each>
    <xsl:text>&#10;|-</xsl:text>

    <xsl:apply-templates />
    <!-- close the table and leave a blank line after it -->
    <xsl:text>&#10;|}&#10;&#10;</xsl:text>
  </xsl:template>

  <xsl:template match="tr">
    <xsl:apply-templates />
    <!-- each row is followed by a row separator on its own line -->
    <xsl:text>&#10;|-</xsl:text>
  </xsl:template>

  <xsl:template match="th|td">
    <!-- header cells start with "! ", data cells with "| " -->
    <xsl:choose>
      <xsl:when test="local-name(.) = 'th'">
        <xsl:text>&#10;! </xsl:text>
      </xsl:when>
      <xsl:when test="local-name(.) = 'td'">
        <xsl:text>&#10;| </xsl:text>
      </xsl:when>
    </xsl:choose>

    <!-- attributes, separated from the cell content by "| " -->
    <xsl:if test="@*">
      <xsl:for-each select="@*">
        <xsl:value-of select="local-name(.)" />
        <xsl:text>="</xsl:text>
        <xsl:value-of select="." />
        <xsl:text>" </xsl:text>
      </xsl:for-each>
      <xsl:text>| </xsl:text>
    </xsl:if>

    <xsl:apply-templates />
  </xsl:template>

  <!-- don't end p tags with newlines in tables -->
  <xsl:template match="table//p">
    <xsl:apply-templates />
  </xsl:template>

  <!-- END tables -->

  <!-- Links. The predicate must use @href: the attribute axis selects the
       href attribute, while a bare "href" step would look for a child
       element of that name and never match an HTML anchor. -->
  <xsl:template match="a[@href]">
    <xsl:text>[</xsl:text>
    <xsl:apply-templates />
    <xsl:text>](</xsl:text>
    <xsl:value-of select="@href" />
    <xsl:text>)</xsl:text>
  </xsl:template>

  <xsl:template match="img">
    <xsl:text>![</xsl:text>
    <xsl:value-of select="@alt" />
    <xsl:text>](</xsl:text>
    <xsl:value-of select="@src" />
    <xsl:text>)</xsl:text>
  </xsl:template>

  <!-- a line break becomes a newline -->
  <xsl:template match="br">
    <xsl:text>&#10;</xsl:text>
  </xsl:template>


  <!-- for most nodes, just dump their text content -->
  <xsl:template match="*">
    <xsl:text/><xsl:apply-templates /><xsl:text/>
  </xsl:template>

</xsl:stylesheet>
@@ -2,7 +2,7 @@ require 'yomu'
2
2
 
3
3
  class Yomu
4
4
  def self.text_from_file(filename)
5
- IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} -t '#{filename}'", 'r') do |io|
5
+ IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --html '#{filename}'", 'r') do |io|
6
6
  io.read
7
7
  end
8
8
  end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.14.2"
2
+ VERSION = "0.15.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-11 00:00:00.000000000 Z
11
+ date: 2017-12-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -185,6 +185,7 @@ files:
185
185
  - lib/slaw/bylaw.rb
186
186
  - lib/slaw/collection.rb
187
187
  - lib/slaw/extract/extractor.rb
188
+ - lib/slaw/extract/html_to_akn_text.xsl
188
189
  - lib/slaw/extract/yomu_patch.rb
189
190
  - lib/slaw/generator.rb
190
191
  - lib/slaw/lifecycle_event.rb