iudex-html 1.2.b.0-java → 1.2.b.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/History.rdoc +6 -0
- data/Manifest.txt +1 -1
- data/bin/iudex-html-clean +113 -25
- data/build/attributes +5 -3
- data/build/tags +113 -98
- data/lib/iudex-html/base.rb +1 -1
- data/lib/iudex-html/{iudex-html-1.2.b.0.jar → iudex-html-1.2.b.1.jar} +0 -0
- data/pom.xml +2 -2
- data/test/test_html_parser.rb +5 -0
- metadata +4 -4
data/History.rdoc
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 1.2.b.1 (2012-5-31)
|
2
|
+
* Add support for HTML 5 (draft) tags, attributes
|
3
|
+
* Neko parser support for HTML 5 <meta charset>
|
4
|
+
* Neko parser keeps non-HTML attributes when skipBanned = false
|
5
|
+
* Add options, barc read support to iudex-html-clean
|
6
|
+
|
1
7
|
=== 1.2.b.0 (2012-3-4)
|
2
8
|
* Upgrade to gravitext-xmlprod ~> 1.5.b
|
3
9
|
* Fix duplicate attributes from Neko, last value wins.
|
data/Manifest.txt
CHANGED
data/bin/iudex-html-clean
CHANGED
@@ -19,40 +19,128 @@
|
|
19
19
|
$LOAD_PATH.unshift File.join( File.dirname(__FILE__), "..", "lib" )
|
20
20
|
|
21
21
|
require 'rubygems'
|
22
|
-
require 'rjack-logback'
|
23
22
|
|
24
|
-
|
25
|
-
require '
|
23
|
+
module IudexBinScript
|
24
|
+
require 'rjack-logback'
|
25
|
+
include RJack
|
26
26
|
|
27
|
-
|
27
|
+
Logback.config_console( :stderr => true )
|
28
28
|
|
29
|
-
require '
|
29
|
+
require 'iudex-html'
|
30
|
+
require 'iudex-filter/key_helper'
|
30
31
|
|
31
|
-
|
32
|
-
include Iudex::HTML
|
33
|
-
include Iudex::HTML::Tree
|
34
|
-
include Iudex::HTML::Filters
|
35
|
-
include Iudex::HTML::Tree::Filters
|
32
|
+
require 'gravitext-xmlprod/extensions'
|
36
33
|
|
37
|
-
|
38
|
-
import 'iudex.html.tree.TreeWalker'
|
34
|
+
require 'java'
|
39
35
|
|
40
|
-
|
41
|
-
|
42
|
-
|
36
|
+
class HTMLCleaner
|
37
|
+
include Gravitext::XMLProd
|
38
|
+
include Iudex::Core
|
39
|
+
include Iudex::HTML
|
40
|
+
include Iudex::HTML::Tree
|
41
|
+
include Iudex::HTML::Filters
|
42
|
+
include Iudex::HTML::Tree::Filters
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
EmptyInlineRemover.new ] )
|
44
|
+
import 'iudex.html.HTMLUtils'
|
45
|
+
import 'iudex.html.neko.NekoHTMLParser'
|
46
|
+
import 'iudex.html.tree.TreeWalker'
|
47
|
+
import 'iudex.http.Headers'
|
48
|
+
import 'iudex.util.Charsets'
|
50
49
|
|
51
|
-
|
50
|
+
def initialize
|
51
|
+
@default_encoding = "UTF-8"
|
52
|
+
@trim_non_displayed = false
|
53
|
+
@indentor = Indentor::PRETTY
|
54
|
+
end
|
55
|
+
|
56
|
+
def run( args = ARGV )
|
57
|
+
files = parse_args( args )
|
58
|
+
|
59
|
+
files.each do |f|
|
60
|
+
if f =~ /\.barc$/
|
61
|
+
process_barc( f )
|
62
|
+
else
|
63
|
+
process_file( f )
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
|
69
|
+
def process_file( fname )
|
70
|
+
|
71
|
+
input = if fname == '-'
|
72
|
+
$stdin.read
|
73
|
+
else
|
74
|
+
IO.read( fname )
|
75
|
+
end
|
76
|
+
|
77
|
+
source = HTMLUtils::source( input.to_java_bytes, @default_encoding )
|
78
|
+
process( source )
|
79
|
+
end
|
80
|
+
|
81
|
+
def process_barc( bname )
|
82
|
+
require 'iudex-barc' #FIXME: Undeclared
|
83
|
+
barc_file = Iudex::BARC::BARCFile.new( java.io.File.new( bname ) )
|
84
|
+
barc_reader = barc_file.reader
|
85
|
+
while( rec = barc_reader.next )
|
86
|
+
next unless rec.type.chr == 'H'
|
87
|
+
source = ContentSource.new( rec.body_input_stream )
|
88
|
+
ctype = Headers.content_type( rec.response_headers )
|
89
|
+
if ctype && ctype.charset
|
90
|
+
enc = Charsets.lookup( ctype.charset )
|
91
|
+
source.set_default_encoding( enc ) if enc
|
92
|
+
end
|
93
|
+
process( source )
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
def process( source )
|
98
|
+
parser = NekoHTMLParser.new
|
99
|
+
parser.skip_banned = @trim_non_displayed
|
100
|
+
|
101
|
+
tree = parser.parse( source )
|
102
|
+
|
103
|
+
filters = [ XmpToPreConverter.new,
|
104
|
+
( [ CSSDisplayFilter.new,
|
105
|
+
AttributeCleaner.new ] if @trim_non_displayed ),
|
106
|
+
MojiBakeCleaner.new,
|
107
|
+
CharactersNormalizer.new,
|
108
|
+
EmptyInlineRemover.new ].flatten.compact
|
109
|
+
|
110
|
+
TreeWalker.walk_depth_first( TreeFilterChain.new( filters ),
|
111
|
+
tree )
|
112
|
+
|
113
|
+
puts tree.to_xml( :indentor => @indentor )
|
114
|
+
end
|
115
|
+
|
116
|
+
def parse_args( args = ARGV )
|
117
|
+
parser = OptionParser.new do |opts|
|
118
|
+
opts.banner =
|
119
|
+
"Usage: iudex-html-clean [options] (FILE|barc)...\n" +
|
120
|
+
"Options:\n"
|
121
|
+
|
122
|
+
opts.on( "-v", "--version", "Display version and exit" ) do
|
123
|
+
puts "iudex-html: #{Iudex::HTML::VERSION}"
|
124
|
+
exit 1
|
125
|
+
end
|
126
|
+
|
127
|
+
opts.on( "-t", "--trim-non-display",
|
128
|
+
"Trim banned/non-displayed elements from output" ) do
|
129
|
+
@trim_non_displayed = true
|
130
|
+
end
|
131
|
+
|
132
|
+
opts.on( "-i", "--indentor NAME",
|
133
|
+
"Specify indentor to use for output: " +
|
134
|
+
"PRETTY (default), COMPACT" ) do |name|
|
135
|
+
@indentor = Indentor.const_get( name.upcase.to_sym )
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
139
|
+
|
140
|
+
parser.parse( args )
|
141
|
+
end
|
52
142
|
|
53
|
-
puts tree.to_xml
|
54
143
|
end
|
55
144
|
|
145
|
+
HTMLCleaner.new.run
|
56
146
|
end
|
57
|
-
|
58
|
-
HTMLCleaner.new.run
|
data/build/attributes
CHANGED
@@ -21,6 +21,7 @@
|
|
21
21
|
# Sources
|
22
22
|
# http://www.w3.org/TR/xhtml11/
|
23
23
|
# http://www.w3.org/TR/html4/
|
24
|
+
# http://dev.w3.org/html5/spec/
|
24
25
|
# http://www.w3schools.com/tags/ref_standardattributes.asp
|
25
26
|
# http://xhtml.com
|
26
27
|
|
@@ -28,6 +29,7 @@ CORE :: ALL except: base head html meta param script style title
|
|
28
29
|
class ,*CORE
|
29
30
|
id ,*CORE
|
30
31
|
style ,*CORE
|
32
|
+
hidden ,*CORE, hidden element
|
31
33
|
title ,CORE, extra title
|
32
34
|
|
33
35
|
LANG :: ALL except: base br frame frameset hr iframe param
|
@@ -40,17 +42,17 @@ content ,meta, text
|
|
40
42
|
scheme ,meta, format URI
|
41
43
|
|
42
44
|
# Anchor and link attributes
|
43
|
-
charset ,a link, char_encoding of link
|
45
|
+
charset ,a link meta, char_encoding of link or (meta) document
|
44
46
|
coords ,*a, coordinates; i.e. image map
|
45
47
|
hreflang ,link, language_code of referent
|
46
48
|
href ,a base link, URL
|
47
|
-
media ,link
|
49
|
+
media ,a area link
|
48
50
|
name ,a, section_name anchor
|
49
51
|
rel ,a link
|
50
52
|
rev ,a link
|
51
53
|
shape ,*a
|
52
54
|
target ,*a *base *link
|
53
|
-
type ,link
|
55
|
+
type ,a link
|
54
56
|
|
55
57
|
# Image and some frame attributes
|
56
58
|
src ,frame img
|
data/build/tags
CHANGED
@@ -17,6 +17,7 @@
|
|
17
17
|
# Sources
|
18
18
|
# http://www.w3.org/TR/xhtml11/
|
19
19
|
# http://www.w3.org/TR/html4/
|
20
|
+
# http://dev.w3.org/html5/spec/
|
20
21
|
# http://www.w3schools.com/tags/default.asp
|
21
22
|
# http://xhtml.com/
|
22
23
|
#
|
@@ -25,106 +26,120 @@
|
|
25
26
|
# S :: In Strict HTML 4.01/XHTML 1.0
|
26
27
|
# T :: In Transitional HTML 4.01/XHTML 1.0
|
27
28
|
# F :: In frameset annex
|
29
|
+
# 5 :: HTML5 new elements
|
28
30
|
# D :: Deprecated
|
29
31
|
# I :: Inline elements (Note <br/> is not labeled inline.)
|
30
32
|
# M :: Metadata elements (content not visible text), i.e. head
|
31
33
|
# B :: Banned/blacklisted elements from which text should not be extracted.
|
32
34
|
|
33
|
-
a , S T F I , anchor
|
34
|
-
abbr , S T F I , abbreviation
|
35
|
-
acronym , S T F
|
36
|
-
address , S T F , contact information for the author or owner
|
37
|
-
applet , T F
|
38
|
-
area ,E S T F , area inside an image-map
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
35
|
+
a , S T F 5 I , anchor
|
36
|
+
abbr , S T F 5 I , abbreviation
|
37
|
+
acronym , S T F I , acronym
|
38
|
+
address , S T F 5 , contact information for the author or owner
|
39
|
+
applet , T F D , embedded applet
|
40
|
+
area ,E S T F 5 , area inside an image-map
|
41
|
+
article , 5 , Structure: An independent content element
|
42
|
+
aside , 5 , Structure: Tangentially related content
|
43
|
+
b , S T F 5 I , bold text
|
44
|
+
base ,E S T F 5 M , default address or target for all links on a page
|
45
|
+
basefont ,E T F D I M , default font; color; or size for the text in a page
|
46
|
+
bdi , 5 I , Text isolated from surrounding for BIDI formatting
|
47
|
+
bdo , S T F 5 I , the text direction
|
48
|
+
big , S T F I , big text
|
49
|
+
blockquote , S T F 5 , long quotation
|
50
|
+
body , S T F 5 , the document's body
|
51
|
+
br ,E S T F 5 , single line break
|
52
|
+
button , S T F 5 I B, push button
|
53
|
+
caption , S T F 5 , table caption
|
54
|
+
center , T F D , centered text
|
55
|
+
cite , S T F 5 I , citation
|
56
|
+
code , S T F 5 I , computer code text
|
57
|
+
col ,E S T F 5 , attribute values for one or more columns in a table
|
58
|
+
colgroup , S T F 5 , group of columns in a table for formatting
|
59
|
+
dd , S T F 5 , description of a term in a definition list
|
60
|
+
del , S T F 5 I , deleted text
|
61
|
+
dfn , S T F 5 I , definition term
|
62
|
+
dir , T F D , directory list
|
63
|
+
div , S T F 5 , section in a document
|
64
|
+
dl , S T F 5 , definition list
|
65
|
+
dt , S T F 5 , term (an item) in a definition list
|
66
|
+
em , S T F 5 I , emphasized text
|
67
|
+
fieldset , S T F 5 B, border around elements in a form
|
68
|
+
figcaption , 5 , Structure: A figure caption
|
69
|
+
figure , 5 , Structure: Self contained content that can be moved.
|
70
|
+
font , T F D I , font; color; or size for text
|
71
|
+
footer , 5 , Structure: A footer of a section
|
72
|
+
form , S T F 5 , form for user input
|
73
|
+
frame ,E F B, window (a frame) in a frameset
|
74
|
+
frameset , F B, set of frames
|
75
|
+
h1 , S T F 5 , heading level 1
|
76
|
+
h2 , S T F 5 , heading level 2
|
77
|
+
h3 , S T F 5 , heading level 3
|
78
|
+
h4 , S T F 5 , heading level 4
|
79
|
+
h5 , S T F 5 , heading level 5
|
80
|
+
h6 , S T F 5 , heading level 6
|
81
|
+
head , S T F 5 M , information about the document
|
82
|
+
header , 5 , Structure: A header of a section
|
83
|
+
hgroup , 5 , Structure: A group of headings
|
84
|
+
hr ,E S T F 5 , horizontal line
|
85
|
+
html , S T F 5 , document
|
86
|
+
i , S T F 5 I , italic text
|
87
|
+
iframe , T F 5 , inline frame
|
88
|
+
img ,E S T F 5 I , image
|
89
|
+
input ,E S T F 5 I B, input control
|
90
|
+
ins , S T F 5 I , inserted text
|
91
|
+
isindex , T F D , searchable index related to a document
|
92
|
+
kbd , S T F 5 I , keyboard text
|
93
|
+
label , S T F 5 I B, label for an input element
|
94
|
+
legend , S T F 5 B, caption for a fieldset element
|
95
|
+
li , S T F 5 , list item
|
96
|
+
link ,E S T F 5 M , relationship between a document and an external resource
|
97
|
+
map , S T F 5 I , image-map
|
98
|
+
mark , 5 I , Text marked/highlighted for reference purposes
|
99
|
+
menu , T F 5 D , menu list
|
100
|
+
meta ,E S T F 5 M , metadata
|
101
|
+
nav , 5 , Structure: container for navigational links
|
102
|
+
noframes , T F B, alternate content where frames not supported
|
103
|
+
noscript , S T F 5 B, alternate content script not supported
|
104
|
+
object , S T F 5 I B, embedded object
|
105
|
+
ol , S T F 5 , ordered list
|
106
|
+
optgroup , S T F 5 B, group of related options in a select list
|
107
|
+
option , S T F 5 B, option in a select list
|
108
|
+
p , S T F 5 , paragraph
|
109
|
+
param ,E S T F 5 , parameter for an object
|
110
|
+
pre , S T F 5 , preformatted text
|
111
|
+
q , S T F 5 I , short quotation
|
112
|
+
rb , 5 , ruby base text
|
113
|
+
rbc , 5 , ruby base container (complex)
|
114
|
+
rp , 5 , ruby simple text container
|
115
|
+
rt , 5 , ruby annotation text
|
116
|
+
rtc , 5 , ruby text container (complex)
|
117
|
+
ruby , 5 I , ruby pronunciation aid
|
118
|
+
s , T F 5 D I , strikethrough text
|
119
|
+
samp , S T F 5 I , sample computer code
|
120
|
+
script , S T F 5 I B, client-side script
|
121
|
+
section , 5 , Structure: generic document/application section
|
122
|
+
select , S T F 5 I B, select list (drop-down list)
|
123
|
+
small , S T F 5 I , small text
|
124
|
+
span , S T F 5 I , section in a document
|
125
|
+
strike , T F D I , strikethrough text
|
126
|
+
strong , S T F 5 I , strong text
|
127
|
+
style , S T F 5 B, style information for a document
|
128
|
+
sub , S T F 5 I , subscripted text
|
129
|
+
sup , S T F 5 I , superscripted text
|
130
|
+
table , S T F 5 , table
|
131
|
+
tbody , S T F 5 , Groups the body content in a table
|
132
|
+
td , S T F 5 , cell in a table
|
133
|
+
textarea , S T F 5 I B, multi-line text input control
|
134
|
+
tfoot , S T F 5 , Groups the footer content in a table
|
135
|
+
th , S T F 5 , header cell in a table
|
136
|
+
thead , S T F 5 , Groups the header content in a table
|
137
|
+
time , 5 I , A date or time
|
138
|
+
title , S T F 5 M , the title of a document
|
139
|
+
tr , S T F 5 , row in a table
|
140
|
+
tt , S T F I , teletype text
|
141
|
+
u , T F 5 D I , underlined text
|
142
|
+
ul , S T F 5 , unordered list
|
143
|
+
var , S T F 5 I , variable part of a text
|
144
|
+
wbr ,E 5 I , A line break opportunity
|
145
|
+
xmp , D , preformatted text
|
data/lib/iudex-html/base.rb
CHANGED
Binary file
|
data/pom.xml
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
<groupId>iudex</groupId>
|
4
4
|
<artifactId>iudex-html</artifactId>
|
5
5
|
<packaging>jar</packaging>
|
6
|
-
<version>1.2.b.0</version>
|
6
|
+
<version>1.2.b.1</version>
|
7
7
|
<name>Iudex HTML parsing/filtering and text extraction</name>
|
8
8
|
|
9
9
|
<parent>
|
@@ -24,7 +24,7 @@
|
|
24
24
|
<dependency>
|
25
25
|
<groupId>com.gravitext</groupId>
|
26
26
|
<artifactId>gravitext-xmlprod</artifactId>
|
27
|
-
<version>[1.5,1.5.9999)</version>
|
27
|
+
<version>[1.5.1,1.5.9999)</version>
|
28
28
|
</dependency>
|
29
29
|
|
30
30
|
<dependency>
|
data/test/test_html_parser.rb
CHANGED
@@ -48,6 +48,11 @@ HTML
|
|
48
48
|
assert_doc( alt, parse( alt, "UTF-8" ) )
|
49
49
|
end
|
50
50
|
|
51
|
+
def test_meta_charset_rerun
|
52
|
+
alt = HTML_META.sub( /<meta .*\/>/, '<meta charset="utf-8"/>' )
|
53
|
+
assert_doc( alt, parse( alt, "ISO-8859-1" ) )
|
54
|
+
end
|
55
|
+
|
51
56
|
HTML_SKIP_TAGS = <<HTML
|
52
57
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
53
58
|
<head>
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: iudex-html
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease: 4
|
5
|
-
version: 1.2.b.0
|
5
|
+
version: 1.2.b.1
|
6
6
|
platform: java
|
7
7
|
authors:
|
8
8
|
- David Kellum
|
@@ -10,7 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2012-
|
13
|
+
date: 2012-06-01 00:00:00 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: iudex-core
|
@@ -41,7 +41,7 @@ dependencies:
|
|
41
41
|
requirements:
|
42
42
|
- - ~>
|
43
43
|
- !ruby/object:Gem::Version
|
44
|
-
version: 1.5.
|
44
|
+
version: 1.5.1
|
45
45
|
requirement: *id003
|
46
46
|
prerelease: false
|
47
47
|
type: :runtime
|
@@ -117,7 +117,7 @@ files:
|
|
117
117
|
- test/test_stax_parser.rb
|
118
118
|
- test/test_tree_walker.rb
|
119
119
|
- test/test_word_counters.rb
|
120
|
-
- lib/iudex-html/iudex-html-1.2.b.0.jar
|
120
|
+
- lib/iudex-html/iudex-html-1.2.b.1.jar
|
121
121
|
homepage: http://github.com/dekellum/iudex
|
122
122
|
licenses: []
|
123
123
|
|