slaw 0.14.2 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 00d35b04f6d4d01a206bf5425face038e0b86f56
4
- data.tar.gz: 514c2b482ffc969b32bdcd7e0d48cce44f63f979
3
+ metadata.gz: 96b892b9bbe4ff26ec5609579851b092bb32aa9b
4
+ data.tar.gz: fe30f6724d2c2bb282d3df0f173a749451f3e291
5
5
  SHA512:
6
- metadata.gz: f4e0f3eda7310ad7bc60ff57e19e7151f55d4736fb9beffabe560311c97a611fe870833593f96c7ac7829c1deec2e31f91e2d1c54729f8f1c8ced9fdcb369408
7
- data.tar.gz: d48951afdc9be70b291451356a1e560a4d10bf8ce03e06387ce6c87c8a635878bb3b57e8fd1a3f0df802005af7ad0366e1286c8623866ca5fef9f85f854b3ae1
6
+ metadata.gz: 977c23bc3650313e40d7fa7358dd848d51011da6329f3b4d905f388b6c79bfbbcf488b27ddd68b5f821365e3a1b1cfebc39f78d52c05257579f484ab2985714c
7
+ data.tar.gz: '0528d773d487b6cde6e5b67c3a4aa69f89c389e2ec62136c0495ec7df9f5278fba5023f9aac33b7bb542a5f795989b163f3d109fd558e484fed2d5848fd7437e'
data/README.md CHANGED
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
218
218
 
219
219
  ## Changelog
220
220
 
221
+ ### 0.15.0
222
+
223
+ * Support tables in many non-PDF documents (e.g. Word documents) by converting to HTML and then to Akoma Ntoso
224
+
221
225
  ### 0.14.2
222
226
 
223
227
  * Convert non-breaking space (\xA0) to space
data/lib/slaw/extract/extractor.rb CHANGED
@@ -28,6 +28,8 @@ module Slaw
28
28
  case mimetype && mimetype.type
29
29
  when 'application/pdf'
30
30
  extract_from_pdf(filename)
31
+ when 'text/html', nil
32
+ extract_from_html(filename)
31
33
  when 'text/plain', nil
32
34
  extract_from_text(filename)
33
35
  else
@@ -78,6 +80,10 @@ module Slaw
78
80
  File.read(filename)
79
81
  end
80
82
 
83
+ def extract_from_html(filename)
84
+ html_to_text(File.read(filename))
85
+ end
86
+
81
87
  # Extract text from +filename+ by sending it to apache tika
82
88
  # http://tika.apache.org/
83
89
  def extract_via_tika(filename)
@@ -87,9 +93,19 @@ module Slaw
87
93
  require 'slaw/extract/yomu_patch'
88
94
  logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
89
95
 
90
- text = Yomu.text_from_file(filename)
91
- logger.info("Tika returned #{text.length} bytes")
92
- text
96
+ html = Yomu.text_from_file(filename)
97
+ logger.info("Tika returned #{html.length} bytes")
98
+ # transform html into text
99
+ html_to_text(html)
100
+ end
101
+
102
+ def html_to_text(html)
103
+ here = File.dirname(__FILE__)
104
+ xslt = Nokogiri::XSLT(File.open(File.join([here, 'html_to_akn_text.xsl'])))
105
+
106
+ text = xslt.transform(Nokogiri::HTML(html)).to_s
107
+ # remove the XML declaration (<?xml ...?>) at the top
108
+ text.sub(/^<\?xml [^>]*>/, '')
93
109
  end
94
110
 
95
111
  def remove_pdf_password(filename)
data/lib/slaw/extract/html_to_akn_text.xsl ADDED
@@ -0,0 +1,115 @@
1
+ <?xml version="1.0"?>
2
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"
3
+ xmlns="http://www.w3.org/1999/xhtml">
4
+
5
+ <xsl:output method="text" indent="no" omit-xml-declaration="yes" encoding="utf-8" />
6
+ <xsl:strip-space elements="*"/>
7
+
8
+ <xsl:template match="html">
9
+ <xsl:apply-templates/>
10
+ </xsl:template>
11
+
12
+ <xsl:template match="head|style|script|link" />
13
+
14
+ <xsl:template match="p|div">
15
+ <xsl:choose>
16
+ <xsl:when test="starts-with(., '[') and substring(., string-length(.)) = ']'">
17
+ <!-- block elems that are wrapped in [ and ] are probably remarks -->
18
+ <xsl:text>[</xsl:text><xsl:apply-templates /><xsl:text>]</xsl:text>
19
+ </xsl:when>
20
+ <xsl:otherwise>
21
+ <xsl:apply-templates />
22
+ </xsl:otherwise>
23
+ </xsl:choose>
24
+ <!-- p and div tags must end with a newline -->
25
+ <xsl:text>
26
+ </xsl:text>
27
+ </xsl:template>
28
+
29
+ <xsl:template match="table">
30
+ <xsl:text>{| </xsl:text>
31
+
32
+ <!-- attributes -->
33
+ <xsl:for-each select="@*[local-name()!='id']">
34
+ <xsl:value-of select="local-name(.)" />
35
+ <xsl:text>="</xsl:text>
36
+ <xsl:value-of select="." />
37
+ <xsl:text>" </xsl:text>
38
+ </xsl:for-each>
39
+ <xsl:text>
40
+ |-</xsl:text>
41
+
42
+ <xsl:apply-templates />
43
+ <xsl:text>
44
+ |}
45
+
46
+ </xsl:text>
47
+ </xsl:template>
48
+
49
+ <xsl:template match="tr">
50
+ <xsl:apply-templates />
51
+ <xsl:text>
52
+ |-</xsl:text>
53
+ </xsl:template>
54
+
55
+ <xsl:template match="th|td">
56
+ <xsl:choose>
57
+ <xsl:when test="local-name(.) = 'th'">
58
+ <xsl:text>
59
+ ! </xsl:text>
60
+ </xsl:when>
61
+ <xsl:when test="local-name(.) = 'td'">
62
+ <xsl:text>
63
+ | </xsl:text>
64
+ </xsl:when>
65
+ </xsl:choose>
66
+
67
+ <!-- attributes -->
68
+ <xsl:if test="@*">
69
+ <xsl:for-each select="@*">
70
+ <xsl:value-of select="local-name(.)" />
71
+ <xsl:text>="</xsl:text>
72
+ <xsl:value-of select="." />
73
+ <xsl:text>" </xsl:text>
74
+ </xsl:for-each>
75
+ <xsl:text>| </xsl:text>
76
+ </xsl:if>
77
+
78
+ <xsl:apply-templates />
79
+ </xsl:template>
80
+
81
+ <!-- don't end p tags with newlines in tables -->
82
+ <xsl:template match="table//p">
83
+ <xsl:apply-templates />
84
+ </xsl:template>
85
+
86
+ <!-- END tables -->
87
+
88
+ <xsl:template match="a[href]">
89
+ <xsl:text>[</xsl:text>
90
+ <xsl:apply-templates />
91
+ <xsl:text>](</xsl:text>
92
+ <xsl:value-of select="@href" />
93
+ <xsl:text>)</xsl:text>
94
+ </xsl:template>
95
+
96
+ <xsl:template match="img">
97
+ <xsl:text>![</xsl:text>
98
+ <xsl:value-of select="@alt" />
99
+ <xsl:text>](</xsl:text>
100
+ <xsl:value-of select="@src" />
101
+ <xsl:text>)</xsl:text>
102
+ </xsl:template>
103
+
104
+ <xsl:template match="br">
105
+ <xsl:text>
106
+ </xsl:text>
107
+ </xsl:template>
108
+
109
+
110
+ <!-- for most nodes, just dump their text content -->
111
+ <xsl:template match="*">
112
+ <xsl:text/><xsl:apply-templates /><xsl:text/>
113
+ </xsl:template>
114
+
115
+ </xsl:stylesheet>
data/lib/slaw/extract/yomu_patch.rb CHANGED
@@ -2,7 +2,7 @@ require 'yomu'
2
2
 
3
3
  class Yomu
4
4
  def self.text_from_file(filename)
5
- IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} -t '#{filename}'", 'r') do |io|
5
+ IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} --html '#{filename}'", 'r') do |io|
6
6
  io.read
7
7
  end
8
8
  end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.14.2"
2
+ VERSION = "0.15.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.2
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-11 00:00:00.000000000 Z
11
+ date: 2017-12-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -185,6 +185,7 @@ files:
185
185
  - lib/slaw/bylaw.rb
186
186
  - lib/slaw/collection.rb
187
187
  - lib/slaw/extract/extractor.rb
188
+ - lib/slaw/extract/html_to_akn_text.xsl
188
189
  - lib/slaw/extract/yomu_patch.rb
189
190
  - lib/slaw/generator.rb
190
191
  - lib/slaw/lifecycle_event.rb