breakout_parser 0.0.0 → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/breakout_parser/depend +4 -0
- data/ext/breakout_parser/extconf.rb +13 -3
- data/ext/breakout_parser/parser.tab.c +2164 -0
- metadata +3 -14
- data/.gitignore +0 -16
- data/Rakefile +0 -99
- data/VERSION +0 -1
- data/ext/breakout_parser/_make.sh +0 -7
- data/ext/breakout_parser/lex.yy.o +0 -0
- data/ext/breakout_parser/main.o +0 -0
- data/ext/breakout_parser/parser +0 -0
- data/ext/breakout_parser/parser.tab.o +0 -0
- data/ext/breakout_parser/ruby_ext.o +0 -0
- data/ext/breakout_parser/test.rb +0 -3
- data/ext/breakout_parser/yywrap.o +0 -0
- data/spec/parser_examples_spec.rb +0 -101
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: breakout_parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrey "Zed" Zaikin
|
@@ -31,30 +31,19 @@ extensions:
|
|
31
31
|
extra_rdoc_files:
|
32
32
|
- LICENSE
|
33
33
|
files:
|
34
|
-
- .gitignore
|
35
34
|
- LICENSE
|
36
|
-
- Rakefile
|
37
|
-
- VERSION
|
38
|
-
- ext/breakout_parser/_make.sh
|
35
|
+
- ext/breakout_parser/depend
|
39
36
|
- ext/breakout_parser/extconf.rb
|
40
37
|
- ext/breakout_parser/lex.yy.c
|
41
|
-
- ext/breakout_parser/lex.yy.o
|
42
38
|
- ext/breakout_parser/main.c
|
43
|
-
- ext/breakout_parser/main.o
|
44
39
|
- ext/breakout_parser/make_win32.bat
|
45
|
-
- ext/breakout_parser/parser
|
46
40
|
- ext/breakout_parser/parser.l
|
41
|
+
- ext/breakout_parser/parser.tab.c
|
47
42
|
- ext/breakout_parser/parser.tab.h
|
48
|
-
- ext/breakout_parser/parser.tab.o
|
49
43
|
- ext/breakout_parser/parser.y
|
50
44
|
- ext/breakout_parser/ruby_ext.c
|
51
|
-
- ext/breakout_parser/ruby_ext.o
|
52
|
-
- ext/breakout_parser/test.rb
|
53
45
|
- ext/breakout_parser/yywrap.c
|
54
|
-
- ext/breakout_parser/yywrap.o
|
55
46
|
- lib/breakout_parser.rb
|
56
|
-
- spec/parser_examples_spec.rb
|
57
|
-
- spec/parser_spec.rb
|
58
47
|
has_rdoc: true
|
59
48
|
homepage: http://assembla.com
|
60
49
|
licenses: []
|
data/.gitignore
DELETED
data/Rakefile
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'rake'
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
|
7
|
-
def gen_tasks
|
8
|
-
Jeweler::Tasks.new do |gem|
|
9
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
10
|
-
gem.name = "breakout_parser"
|
11
|
-
gem.platform = ENV['PLATFORM'] if ENV['PLATFORM']
|
12
|
-
gem.summary = %Q{BreakoutParser}
|
13
|
-
gem.description = %Q{BreakoutParser}
|
14
|
-
gem.email = "zed.0xff@gmail.com"
|
15
|
-
gem.homepage = "http://assembla.com"
|
16
|
-
gem.authors = ["Andrey \"Zed\" Zaikin"]
|
17
|
-
gem.add_development_dependency "rspec", ">= 1.2.9"
|
18
|
-
gem.test_files.delete 'spec/parser_examples_spec.rb'
|
19
|
-
gem.files.delete_if{ |f| f[0..8] == 'examples/' }
|
20
|
-
gem.files.delete_if{ |f| f[0..4] == 'misc/' }
|
21
|
-
if gem.platform == 'ruby'
|
22
|
-
gem.files.include 'ext/**/*'
|
23
|
-
gem.files.delete "ext/breakout_parser/Makefile"
|
24
|
-
gem.files.delete_if{ |f| f[-3..-1] == '.so' }
|
25
|
-
else
|
26
|
-
gem.files.include 'lib/**/*.so'
|
27
|
-
gem.extensions = '.' # HACK: package no extensions
|
28
|
-
end
|
29
|
-
end
|
30
|
-
Jeweler::GemcutterTasks.new
|
31
|
-
end
|
32
|
-
gen_tasks
|
33
|
-
rescue LoadError
|
34
|
-
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
35
|
-
end
|
36
|
-
|
37
|
-
require 'spec/rake/spectask'
|
38
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
39
|
-
spec.libs << 'lib' << 'spec'
|
40
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
41
|
-
end
|
42
|
-
|
43
|
-
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
44
|
-
spec.libs << 'lib' << 'spec'
|
45
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
46
|
-
spec.rcov = true
|
47
|
-
end
|
48
|
-
|
49
|
-
task :spec => :check_dependencies
|
50
|
-
|
51
|
-
task :default => :spec
|
52
|
-
|
53
|
-
require 'rake/rdoctask'
|
54
|
-
Rake::RDocTask.new do |rdoc|
|
55
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
56
|
-
|
57
|
-
rdoc.rdoc_dir = 'rdoc'
|
58
|
-
rdoc.title = "breakout_parser #{version}"
|
59
|
-
# rdoc.rdoc_files.include('README*')
|
60
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
61
|
-
end
|
62
|
-
|
63
|
-
######################
|
64
|
-
|
65
|
-
namespace :build do
|
66
|
-
desc "Build all gem variants"
|
67
|
-
task :all do
|
68
|
-
Rake::Task[ :build ].execute
|
69
|
-
|
70
|
-
@gems_to_push = []
|
71
|
-
@gems_to_push << Rake.application.jeweler.gemspec_helper.gem_path
|
72
|
-
|
73
|
-
gem = Rake.application.jeweler_tasks.gemspec
|
74
|
-
gem.files.delete_if{ |f| f[0..3] == 'ext/' }
|
75
|
-
gem.extensions = []
|
76
|
-
gem.files.include 'lib/**/*.so'
|
77
|
-
|
78
|
-
gem.original_platform = nil
|
79
|
-
gem.platform = 'x86-mingw32'
|
80
|
-
Rake::Task[ :build ].execute
|
81
|
-
@gems_to_push << Rake.application.jeweler.gemspec_helper.gem_path
|
82
|
-
|
83
|
-
gem.original_platform = nil
|
84
|
-
gem.platform = 'x86-mswin32'
|
85
|
-
Rake::Task[ :build ].execute
|
86
|
-
@gems_to_push << Rake.application.jeweler.gemspec_helper.gem_path
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
namespace 'gemcutter:release' do
|
91
|
-
desc "Release all gem variants"
|
92
|
-
task :all => 'build:all' do
|
93
|
-
@gems_to_push.each do |fname|
|
94
|
-
command = "gem push #{fname}"
|
95
|
-
puts "Executing #{command.inspect}:"
|
96
|
-
sh command
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.0
|
Binary file
|
data/ext/breakout_parser/main.o
DELETED
Binary file
|
data/ext/breakout_parser/parser
DELETED
Binary file
|
Binary file
|
Binary file
|
data/ext/breakout_parser/test.rb
DELETED
Binary file
|
@@ -1,101 +0,0 @@
|
|
1
|
-
describe 'BreakoutParser' do
|
2
|
-
|
3
|
-
describe "bad examples" do
|
4
|
-
Dir["examples/orig/*.bad"].sort.each do |fname|
|
5
|
-
it "should not die on #{fname} " do
|
6
|
-
data = File.read(fname)
|
7
|
-
parse_file(fname).size.should >= File.read(fname).strip.gsub(/\s+/,' ').size
|
8
|
-
end
|
9
|
-
end
|
10
|
-
end
|
11
|
-
describe "pending examples" do
|
12
|
-
Dir["examples/orig/*.pending"].sort.each do |fname|
|
13
|
-
it "should parse #{fname} "
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
describe "preparsed examples" do
|
18
|
-
Dir["examples/orig/*.txt"].sort.each do |fname|
|
19
|
-
bname = File.basename(fname)
|
20
|
-
it "should parse #{fname} " do
|
21
|
-
preparsed = File.read("examples/parsed/#{bname}")
|
22
|
-
preparsed = preparsed[3..-1] if preparsed[0..2] == '<p>'
|
23
|
-
preparsed = preparsed[0..-5] if preparsed[-4..-1] == '</p>'
|
24
|
-
preparsed.gsub!("–","-")
|
25
|
-
preparsed.gsub!("—","--")
|
26
|
-
preparsed.gsub!("‘","'")
|
27
|
-
preparsed.gsub!("’","'")
|
28
|
-
preparsed.gsub!("…","...")
|
29
|
-
preparsed.gsub!("×","x")
|
30
|
-
preparsed.gsub!("©","(c)")
|
31
|
-
preparsed.gsub!("<br />\n","<br />")
|
32
|
-
preparsed.gsub!(/[ \t]+<br \/>/,"<br />")
|
33
|
-
preparsed.gsub!("\t"," ")
|
34
|
-
if preparsed['<hr />']
|
35
|
-
# find longest dash-line in source
|
36
|
-
dashline = File.read(fname).scan(/-+/).sort_by{ |x| -x.length }.first
|
37
|
-
preparsed.gsub!("</p>\n<hr />\n<p>","<br /><br />#{dashline}<br /><br />");
|
38
|
-
end
|
39
|
-
|
40
|
-
# preparsed.gsub!(/^<p>/,"");
|
41
|
-
# preparsed.gsub!(/<\/p>$/,"");
|
42
|
-
preparsed.gsub!("</pre>\n<ol>","</pre><br /><br /><ol>")
|
43
|
-
preparsed.gsub!(/<\/p>\s+<p>/,"<br /><br />")
|
44
|
-
preparsed.gsub!("</p>\n","<br /><br />")
|
45
|
-
preparsed.gsub!("<p>","<br /><br />")
|
46
|
-
preparsed.gsub!(/[\r\n]+ */," ")
|
47
|
-
preparsed.gsub!(/[ \t]{2,}/," ")
|
48
|
-
|
49
|
-
preparsed.gsub!("<del>","-")
|
50
|
-
preparsed.gsub!("</del>","-")
|
51
|
-
preparsed.gsub!(/<br \/>[ ]+/,"<br />")
|
52
|
-
preparsed.gsub!(/(<br \/>){2,}/,"<br /><br />")
|
53
|
-
# preparsed.gsub!("<br /><ol>","<ol>")
|
54
|
-
# preparsed.gsub!("<br /><ul>","<ul>")
|
55
|
-
# preparsed.gsub!("<br /><br /><ul>","<br /><ul>")
|
56
|
-
|
57
|
-
parsed = parse_file(fname)
|
58
|
-
|
59
|
-
# old parser not parses raw text urls
|
60
|
-
#parsed.gsub!(%r'<a href="([^<>"]+)">([^<>"]+)</a>',"\\1")
|
61
|
-
|
62
|
-
t1 = parsed
|
63
|
-
t2 = preparsed
|
64
|
-
|
65
|
-
[t1,t2].each do |t|
|
66
|
-
t.downcase!
|
67
|
-
t.gsub!(/(\s*<br \/>\s*)+/,' ')
|
68
|
-
t.gsub!(/\n\s*/,"\n")
|
69
|
-
# t.gsub!(/>[ \t]+</,"><")
|
70
|
-
t.gsub!(/>[ \t]+/,">")
|
71
|
-
t.gsub!(/[ \t]+</,"<")
|
72
|
-
t.gsub!(/[\r\n \t]+/," ")
|
73
|
-
t.strip!
|
74
|
-
end
|
75
|
-
|
76
|
-
if t1 != t2
|
77
|
-
# File.open("last-parsed.tmp","w"){ |f| f << parsed }
|
78
|
-
# File.open("last-preparsed.tmp","w"){ |f| f << preparsed }
|
79
|
-
pos = 0
|
80
|
-
pos += 1 while t1[0..pos] == t2[0..pos]
|
81
|
-
pos -= 5
|
82
|
-
pos = 0 if pos<0
|
83
|
-
t1[pos..-1].should == t2[pos..-1]
|
84
|
-
end
|
85
|
-
t1.should == t2
|
86
|
-
end
|
87
|
-
$n ||= 0
|
88
|
-
$n += 1
|
89
|
-
# break if $n == 1900
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
###############################################################################
|
94
|
-
###############################################################################
|
95
|
-
###############################################################################
|
96
|
-
|
97
|
-
def parse_file fname
|
98
|
-
r = `cat #{fname} | ./parser`
|
99
|
-
r.strip
|
100
|
-
end
|
101
|
-
end
|